From a200be1ffad4369a33803edf4132e86e910ff7ba Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sat, 17 Apr 2021 19:34:51 -0500 Subject: [PATCH 01/24] add literate jl files --- literate_notebooks/src-ES/01_constructors.jl | 143 +++++++++++ literate_notebooks/src-ES/02_basicinfo.jl | 76 ++++++ literate_notebooks/src-ES/03_missingvalues.jl | 112 +++++++++ literate_notebooks/src-ES/04_loadsave.jl | 64 +++++ literate_notebooks/src-ES/05_columns.jl | 187 ++++++++++++++ literate_notebooks/src-ES/06_rows.jl | 177 ++++++++++++++ literate_notebooks/src-ES/07_factors.jl | 231 ++++++++++++++++++ literate_notebooks/src-ES/08_joins.jl | 76 ++++++ literate_notebooks/src-ES/09_reshaping.jl | 90 +++++++ literate_notebooks/src-ES/10_transforms.jl | 80 ++++++ literate_notebooks/src-ES/11_performance.jl | 135 ++++++++++ literate_notebooks/src-ES/12_pitfalls.jl | 73 ++++++ literate_notebooks/src-ES/13_extras.jl | 198 +++++++++++++++ literate_notebooks/src/01_constructors.jl | 143 +++++++++++ literate_notebooks/src/02_basicinfo.jl | 76 ++++++ literate_notebooks/src/03_missingvalues.jl | 112 +++++++++ literate_notebooks/src/04_loadsave.jl | 64 +++++ literate_notebooks/src/05_columns.jl | 187 ++++++++++++++ literate_notebooks/src/06_rows.jl | 177 ++++++++++++++ literate_notebooks/src/07_factors.jl | 231 ++++++++++++++++++ literate_notebooks/src/08_joins.jl | 76 ++++++ literate_notebooks/src/09_reshaping.jl | 90 +++++++ literate_notebooks/src/10_transforms.jl | 80 ++++++ literate_notebooks/src/11_performance.jl | 135 ++++++++++ literate_notebooks/src/12_pitfalls.jl | 73 ++++++ literate_notebooks/src/13_extras.jl | 198 +++++++++++++++ 26 files changed, 3284 insertions(+) create mode 100644 literate_notebooks/src-ES/01_constructors.jl create mode 100644 literate_notebooks/src-ES/02_basicinfo.jl create mode 100644 literate_notebooks/src-ES/03_missingvalues.jl create mode 100644 literate_notebooks/src-ES/04_loadsave.jl create mode 100644 literate_notebooks/src-ES/05_columns.jl create mode 
100644 literate_notebooks/src-ES/06_rows.jl create mode 100644 literate_notebooks/src-ES/07_factors.jl create mode 100644 literate_notebooks/src-ES/08_joins.jl create mode 100644 literate_notebooks/src-ES/09_reshaping.jl create mode 100644 literate_notebooks/src-ES/10_transforms.jl create mode 100644 literate_notebooks/src-ES/11_performance.jl create mode 100644 literate_notebooks/src-ES/12_pitfalls.jl create mode 100644 literate_notebooks/src-ES/13_extras.jl create mode 100644 literate_notebooks/src/01_constructors.jl create mode 100644 literate_notebooks/src/02_basicinfo.jl create mode 100644 literate_notebooks/src/03_missingvalues.jl create mode 100644 literate_notebooks/src/04_loadsave.jl create mode 100644 literate_notebooks/src/05_columns.jl create mode 100644 literate_notebooks/src/06_rows.jl create mode 100644 literate_notebooks/src/07_factors.jl create mode 100644 literate_notebooks/src/08_joins.jl create mode 100644 literate_notebooks/src/09_reshaping.jl create mode 100644 literate_notebooks/src/10_transforms.jl create mode 100644 literate_notebooks/src/11_performance.jl create mode 100644 literate_notebooks/src/12_pitfalls.jl create mode 100644 literate_notebooks/src/13_extras.jl diff --git a/literate_notebooks/src-ES/01_constructors.jl b/literate_notebooks/src-ES/01_constructors.jl new file mode 100644 index 0000000..333a81e --- /dev/null +++ b/literate_notebooks/src-ES/01_constructors.jl @@ -0,0 +1,143 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** +# +# Let's get started by loading the `DataFrames` package. + +using DataFrames + +# ## Constructors and conversion + +#- + +# ### Constructors +# +# In this section, you'll see many ways to create a `DataFrame` using the `DataFrame()` constructor. +# +# First, we could create an empty DataFrame, + +DataFrame() # empty DataFrame + +# Or we could call the constructor using keyword arguments to add columns to the `DataFrame`. 
+ +DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3])) + +# We can create a `DataFrame` from a dictionary, in which case keys from the dictionary will be sorted to create the `DataFrame` columns. + +x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b']) +DataFrame(x) + +# Rather than explicitly creating a dictionary first, as above, we could pass `DataFrame` arguments with the syntax of dictionary key-value pairs. +# +# Note that in this case, we use symbols to denote the column names and arguments are not sorted. For example, `:A`, the symbol, produces `A`, the name of the first column here: + +DataFrame(:A => [1,2], :B => [true, false], :C => ['a', 'b']) + +# Here we create a `DataFrame` from a vector of vectors, and each vector becomes a column. + +DataFrame([rand(3) for i in 1:3]) + +# For now we can construct a single `DataFrame` from a `Vector` of atoms, creating a `DataFrame` with a single row. In future releases of DataFrames.jl, this will throw an error. + +DataFrame(rand(3)) + +# Instead use a transposed vector if you have a vector of atoms (in this way you effectively pass a two dimensional array to the constructor which is supported). + +DataFrame(transpose([1, 2, 3])) + +# Pass a second argument to give the columns names. + +DataFrame([1:3, 4:6, 7:9], [:A, :B, :C]) + +# Here we create a `DataFrame` from a matrix, + +DataFrame(rand(3,4)) + +# and here we do the same but also pass column names. + +DataFrame(rand(3,4), Symbol.('a':'d')) + +# We can also construct an uninitialized DataFrame. +# +# Here we pass column types, names and number of rows; we get `missing` in column :C because `Any >: Missing`. + +DataFrame([Int, Float64, Any], [:A, :B, :C], 1) + +# Here we create a `DataFrame`, but column `:C` is #undef and Jupyter has problem with displaying it. (This works OK at the REPL.) +# +# This will be fixed in next release of DataFrames! 
+ +DataFrame([Int, Float64, String], [:A, :B, :C], 1) + +# To initialize a `DataFrame` with column names, but no rows use + +DataFrame([Int, Float64, String], [:A, :B, :C], 0) + +# This syntax gives us a quick way to create homogenous `DataFrame`. + +DataFrame(Int, 3, 5) + +# This example is similar, but has nonhomogenous columns. + +DataFrame([Int, Float64], 4) + +# Finally, we can create a `DataFrame` by copying an existing `DataFrame`. +# +# Note that `copy` creates a shallow copy. + +y = DataFrame(x) +z = copy(x) +(x === y), (x === z), isequal(x, z) + +# ### Conversion to a matrix +# +# Let's start by creating a `DataFrame` with two rows and two columns. + +x = DataFrame(x=1:2, y=["A", "B"]) + +# We can create a matrix by passing this `DataFrame` to `Matrix`. + +Matrix(x) + +# This would work even if the `DataFrame` had some `missing`s: + +x = DataFrame(x=1:2, y=[missing,"B"]) + +#- + +Matrix(x) + +# In the two previous matrix examples, Julia created matrices with elements of type `Any`. We can see more clearly that the type of matrix is inferred when we pass, for example, a `DataFrame` of integers to `Matrix`, creating a 2D `Array` of `Int64`s: + +x = DataFrame(x=1:2, y=3:4) + +#- + +Matrix(x) + +# In this next example, Julia correctly identifies that `Union` is needed to express the type of the resulting `Matrix` (which contains `missing`s). + +x = DataFrame(x=1:2, y=[missing,4]) + +#- + +Matrix(x) + +# Note that we can't force a conversion of `missing` values to `Int`s! + +Matrix{Int}(x) + +# ### Handling of duplicate column names +# +# We can pass the `makeunique` keyword argument to allow passing duplicate names (they get deduplicated) + +df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true) + +# Otherwise, duplicates will not be allowed in the future. + +df = DataFrame(:a=>1, :a=>2, :a_1=>3) + +# A constructor that is passed column names as keyword arguments is a corner case. +# You cannot pass `makeunique` to allow duplicates here. 
+ +df = DataFrame(a=1, a=2, makeunique=true) + diff --git a/literate_notebooks/src-ES/02_basicinfo.jl b/literate_notebooks/src-ES/02_basicinfo.jl new file mode 100644 index 0000000..6cde7c6 --- /dev/null +++ b/literate_notebooks/src-ES/02_basicinfo.jl @@ -0,0 +1,76 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Getting basic information about a data frame +# +# Let's start by creating a `DataFrame` object, `x`, so that we can learn how to get information on that data frame. + +x = DataFrame(A = [1, 2], B = [1.0, missing], C = ["a", "b"]) + +# The standard `size` function works to get dimensions of the `DataFrame`, + +size(x), size(x, 1), size(x, 2) + +# as well as `nrow` and `ncol` from R; `length` gives number of columns. + +nrow(x), ncol(x), length(x) + +# `describe` gives basic summary statistics of data in your `DataFrame`. + +describe(x) + +# Use `showcols` to get informaton about columns stored in a DataFrame. + +showcols(x) + +# `names` will return the names of all columns, + +names(x) + +# and `eltypes` returns their types. + +eltypes(x) + +# Here we create some large DataFrame + +y = DataFrame(rand(1:10, 1000, 10)); + +# and then we can use `head` to peek into its top rows + +head(y) + +# and `tail` to see its bottom rows. + +tail(y, 3) + +# ### Most elementary get and set operations +# +# Given the `DataFrame`, `x`, here are three ways to grab one of its columns as a `Vector`: + +x[1], x[:A], x[:, 1] + +# To grab one row as a DataFrame, we can index as follows. + +x[1, :] + +# We can grab a single cell or element with the same syntax to grab an element of an array. + +x[1, 1] + +# Assignment can be done in ranges to a scalar, + +x[1:2, 1:2] = 1 +x + +# to a vector of length equal to the number of assigned rows, + +x[1:2, 1:2] = [1,2] +x + +# or to another data frame of matching size. 
+ +x[1:2, 1:2] = DataFrame([5 6; 7 8]) +x + diff --git a/literate_notebooks/src-ES/03_missingvalues.jl b/literate_notebooks/src-ES/03_missingvalues.jl new file mode 100644 index 0000000..1e17d97 --- /dev/null +++ b/literate_notebooks/src-ES/03_missingvalues.jl @@ -0,0 +1,112 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Handling missing values +# +# A singleton type `Missings.Missing` allows us to deal with missing values. + +missing, typeof(missing) + +# Arrays automatically create an appropriate union type. + +x = [1, 2, missing, 3] + +# `ismissing` checks if passed value is missing. + +ismissing(1), ismissing(missing), ismissing(x), ismissing.(x) + +# We can extract the type combined with Missing from a `Union` via +# +# (This is useful for arrays!) + +eltype(x), Missings.T(eltype(x)) + +# `missing` comparisons produce `missing`. + +missing == missing, missing != missing, missing < missing + +# This is also true when `missing`s are compared with values of other types. + +1 == missing, 1 != missing, 1 < missing + +# `isequal`, `isless`, and `===` produce results of type `Bool`. + +isequal(missing, missing), missing === missing, isequal(1, missing), isless(1, missing) + +# In the next few examples, we see that many (not all) functions handle `missing`. + +map(x -> x(missing), [sin, cos, zero, sqrt]) # part 1 + +#- + +map(x -> x(missing, 1), [+, - , *, /, div]) # part 2 + +#- + +map(x -> x([1,2,missing]), [minimum, maximum, extrema, mean, any, float]) # part 3 + +# `skipmissing` returns an iterator skipping missing values. We can use `collect` and `skipmissing` to create an array that excludes these missing values. + +collect(skipmissing([1, missing, 2, missing])) + +# Similarly, here we combine `collect` and `Missings.replace` to create an array that replaces all missing values with some value (`NaN` in this case). 
+ +collect(Missings.replace([1.0, missing, 2.0, missing], NaN)) + +# Another way to do this: + +coalesce.([1.0, missing, 2.0, missing], NaN) + +# Caution: `nothing` would also be replaced here (for Julia 0.7 a more sophisticated behavior of `coalesce` that allows to avoid this problem is planned). + +coalesce.([1.0, missing, nothing, missing], NaN) + +# You can use `recode` if you have homogenous output types. + +recode([1.0, missing, 2.0, missing], missing=>NaN) + +# You can use `unique` or `levels` to get unique values with or without missings, respectively. + +unique([1, missing, 2, missing]), levels([1, missing, 2, missing]) + +# In this next example, we convert `x` to `y` with `allowmissing`, where `y` has a type that accepts missings. + +x = [1,2,3] +y = allowmissing(x) + +# Then, we convert back with `disallowmissing`. This would fail if `y` contained missing values! + +z = disallowmissing(y) +x,y,z + +# In this next example, we show that the type of each column in `x` is initially `Int64`. After using `allowmissing!` to accept missing values in columns 1 and 3, the types of those columns become `Union`s of `Int64` and `Missings.Missing`. + +x = DataFrame(Int, 2, 3) +println("Before: ", eltypes(x)) +allowmissing!(x, 1) # make first column accept missings +allowmissing!(x, :x3) # make :x3 column accept missings +println("After: ", eltypes(x)) + +# In this next example, we'll use `completecases` to find all the rows of a `DataFrame` that have complete data. + +x = DataFrame(A=[1, missing, 3, 4], B=["A", "B", missing, "C"]) +println(x) +println("Complete cases:\n", completecases(x)) + +# We can use `dropmissing` or `dropmissing!` to remove the rows with incomplete data from a `DataFrame` and either create a new `DataFrame` or mutate the original in-place. + +y = dropmissing(x) +dropmissing!(x) +[x, y] + +# When we call `showcols` on a `DataFrame` with dropped missing values, the columns still allow missing values. 
+ +showcols(x) + +# Since we've excluded missing values, we can safely use `disallowmissing!` so that the columns will no longer accept missing values. + +disallowmissing!(x) +showcols(x) + diff --git a/literate_notebooks/src-ES/04_loadsave.jl b/literate_notebooks/src-ES/04_loadsave.jl new file mode 100644 index 0000000..d166830 --- /dev/null +++ b/literate_notebooks/src-ES/04_loadsave.jl @@ -0,0 +1,64 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Load and save DataFrames +# We do not cover all features of the packages. Please refer to their documentation to learn them. +# +# Here we'll load `CSV` to read and write CSV files and `JLD`, which allows us to work with a Julia native binary format. + +using CSV +using JLD + +# Let's create a simple `DataFrame` for testing purposes, + +x = DataFrame(A=[true, false, true], B=[1, 2, missing], + C=[missing, "b", "c"], D=['a', missing, 'c']) + + +# and use `eltypes` to look at the columnwise types. + +eltypes(x) + +# Let's use `CSV` to save `x` to disk; make sure `x.csv` does not conflict with some file in your working directory. + +CSV.write("x.csv", x) + +# Now we can see how it was saved by reading `x.csv`. + +print(read("x.csv", String)) + +# We can also load it back. `use_mmap=false` disables memory mapping so that on Windows the file can be deleted in the same session. + +y = CSV.read("x.csv", use_mmap=false) + +# When loading in a `DataFrame` from a `CSV`, all columns allow `Missing` by default. Note that the column types have changed! + +eltypes(y) + +# Now let's save `x` to a file in a binary format; make sure that `x.jld` does not exist in your working directory. + +save("x.jld", "x", x) + +# After loading in `x.jld` as `y`, `y` is identical to `x`. + +y = load("x.jld", "x") + +# Note that the column types of `y` are the same as those of `x`! 
+ +eltypes(y) + +# Next, we'll create the files `bigdf.csv` and `bigdf.jld`, so be careful that you don't already have these files on disk! +# +# In particular, we'll time how long it takes us to write a `DataFrame` with 10^3 rows and 10^2 columns to `.csv` and `.jld` files. *You can expect JLD to be faster!* Use `compress=true` to reduce file sizes. + +bigdf = DataFrame(Bool, 10^3, 10^2) +@time CSV.write("bigdf.csv", bigdf) +@time save("bigdf.jld", "bigdf", bigdf) +getfield.(stat.(["bigdf.csv", "bigdf.jld"]), :size) + +# Finally, let's clean up. Do not run the next cell unless you are sure that it will not erase your important files. + +foreach(rm, ["x.csv", "x.jld", "bigdf.csv", "bigdf.jld"]) + diff --git a/literate_notebooks/src-ES/05_columns.jl b/literate_notebooks/src-ES/05_columns.jl new file mode 100644 index 0000000..f32e02a --- /dev/null +++ b/literate_notebooks/src-ES/05_columns.jl @@ -0,0 +1,187 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Manipulating columns of DataFrame + +#- + +# ### Renaming columns +# +# Let's start with a `DataFrame` of `Bool`s that has default column names. + +x = DataFrame(Bool, 3, 4) + +# With `rename`, we create a new `DataFrame`; here we rename the column `:x1` to `:A`. (`rename` also accepts collections of Pairs.) + +rename(x, :x1 => :A) + +# With `rename!` we do an in place transformation. +# +# This time we've applied a function to every column name. + +rename!(c -> Symbol(string(c)^2), x) + +# We can also change the name of a particular column without knowing the original. +# +# Here we change the name of the third column, creating a new `DataFrame`. + +rename(x, names(x)[3] => :third) + +# With `names!`, we can change the names of all variables. 
+ +names!(x, [:a, :b, :c, :d]) + +# We get an error when we try to provide duplicate names + +names!(x, fill(:a, 4)) + +# unless we pass `makeunique=true`, which allows us to handle duplicates in passed names. + +names!(x, fill(:a, 4), makeunique=true) + +# ### Reordering columns + +#- + +# We can reorder the names(x) vector as needed, creating a new DataFrame. + +srand(1234) +x[shuffle(names(x))] + +# also `permutecols!` will be introduced in next release of DataFrames + +#- + +# ### Merging/adding columns + +x = DataFrame([(i,j) for i in 1:3, j in 1:4]) + +# With `hcat` we can merge two `DataFrame`s. Also [x y] syntax is supported but only when DataFrames have unique column names. + +hcat(x, x, makeunique=true) + +# We can also use `hcat` to add a new column; a default name `:x1` will be used for this column, so `makeunique=true` is needed. + +y = hcat(x, [1,2,3], makeunique=true) + +# You can also prepend a vector with `hcat`. + +hcat([1,2,3], x, makeunique=true) + +# Alternatively you could append a vector with the following syntax. This is a bit more verbose but cleaner. + +y = [x DataFrame(A=[1,2,3])] + +# Here we do the same but add column `:A` to the front. + +y = [DataFrame(A=[1,2,3]) x] + +# A column can also be added in the middle. Here a brute-force method is used and a new DataFrame is created. + +using BenchmarkTools +@btime [$x[1:2] DataFrame(A=[1,2,3]) $x[3:4]] + +# We could also do this with a specialized in place method `insert!`. Let's add `:newcol` to the `DataFrame` `y`. + +insert!(y, 2, [1,2,3], :newcol) + +# If you want to insert the same column name several times `makeunique=true` is needed as usual. + +insert!(y, 2, [1,2,3], :newcol, makeunique=true) + +# We can see how much faster it is to insert a column with `insert!` than with `hcat` using `@btime`. + +@btime insert!(copy($x), 3, [1,2,3], :A) + +# Let's use `insert!` to append a column in place, + +insert!(x, ncol(x)+1, [1,2,3], :A) + +# and to in place prepend a column. 
+ +insert!(x, 1, [1,2,3], :B) + +# With `merge!`, let's merge the second DataFrame into first, but overwriting duplicates. + +df1 = DataFrame(x=1:3, y=4:6) +df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) +df1, df2, merge!(df1, df2) + +# For comparison: merge two `DataFrames`s but renaming duplicate names via `hcat`. + +df1 = DataFrame(x=1:3, y=4:6) +df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) +hcat(df1, df2, makeunique=true) + +# ### Subsetting/removing columns +# +# Let's create a new `DataFrame` `x` and show a few ways to create DataFrames with a subset of `x`'s columns. + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# First we could do this by index + +x[[1,2,4,5]] + +# or by column name. + +x[[:x1, :x4]] + +# We can also choose to keep or exclude columns by `Bool`. (We need a vector whose length is the number of columns in the original `DataFrame`.) + +x[[true, false, true, false, true]] + +# Here we create a single column `DataFrame`, + +x[[:x1]] + +# and here we access the vector contained in column `:x1`. + +x[:x1] + +# We could grab the same vector by column number + +x[1] + +# and remove everything from a `DataFrame` with `empty!`. + +empty!(y) + +# Here we create a copy of `x` and delete the 3rd column from the copy with `delete!`. + +z = copy(x) +x, delete!(z, 3) + +# ### Modify column by name + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# With the following syntax, the existing column is modified without performing any copying. + +x[:x1] = x[:x2] +x + +# We can also use the following syntax to add a new column at the end of a `DataFrame`. + +x[:A] = [1,2,3] +x + +# A new column name will be added to our `DataFrame` with the following syntax as well (7 is equal to `ncol(x)+1`). 
+ +x[7] = 11:13 +x + +# ### Find column name + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# We can check if a column with a given name exists via + +:x1 in names(x) + +# and determine its index via + +findfirst(names(x), :x2) + diff --git a/literate_notebooks/src-ES/06_rows.jl b/literate_notebooks/src-ES/06_rows.jl new file mode 100644 index 0000000..3660e40 --- /dev/null +++ b/literate_notebooks/src-ES/06_rows.jl @@ -0,0 +1,177 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package +srand(1); + +# ## Manipulating rows of DataFrame + +#- + +# ### Reordering rows + +x = DataFrame(id=1:10, x = rand(10), y = [zeros(5); ones(5)]) # and we hope that x[:x] is not sorted :) + +#- + +issorted(x), issorted(x, :x) # check if a DataFrame or a subset of its columns is sorted + +#- + +sort!(x, :x) # sort x in place + +#- + +y = sort(x, :id) # new DataFrame + +#- + +sort(x, (:y, :x), rev=(true, false)) # sort by two columns, first is decreasing, second is increasing + +#- + +sort(x, (order(:y, rev=true), :x)) # the same as above + +#- + +sort(x, (order(:y, rev=true), order(:x, by=v->-v))) # some more fancy sorting stuff + +#- + +x[shuffle(1:10), :] # reorder rows (here randomly) + +#- + +sort!(x, :id) +x[[1,10],:] = x[[10,1],:] # swap rows +x + +#- + +x[1,:], x[10,:] = x[10,:], x[1,:] # and swap again +x + +# ### Merging/adding rows + +x = DataFrame(rand(3, 5)) + +#- + +[x; x] # merge by rows - data frames must have the same column names; the same is vcat + +#- + +y = x[reverse(names(x))] # get y with other order of names + +#- + +vcat(x, y) # we get what we want as vcat does column name matching + +#- + +vcat(x, y[1:3]) # but column names must still match + +#- + +append!(x, x) # the same but modifies x + +#- + +append!(x, y) # here column names must match exactly + +#- + +push!(x, 1:5) # add one row to x at the end; must give correct number of values and correct types +x + +#- + 
+push!(x, Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # also works with dictionaries +x + +# ### Subsetting/removing rows + +x = DataFrame(id=1:10, val='a':'j') + +#- + +x[1:2, :] # by index + +#- + +view(x, 1:2) # the same but a view + +#- + +x[repmat([true, false], 5), :] # by Bool, exact length required + +#- + +view(x, repmat([true, false], 5), :) # view again + +#- + +deleterows!(x, 7) # delete one row + +#- + +deleterows!(x, 6:7) # delete a collection of rows + +#- + +x = DataFrame([1:4, 2:5, 3:6]) + +#- + +filter(r -> r[:x1] > 2.5, x) # create a new DataFrame where filtering function operates on DataFrameRow + +#- + +## in place modification of x, an example with do-block syntax +filter!(x) do r + if r[:x1] > 2.5 + return r[:x2] < 4.5 + end + r[:x3] < 3.5 +end + +# ### Deduplicating + +x = DataFrame(A=[1,2], B=["x","y"]) +append!(x, x) +x[:C] = 1:4 +x + +#- + +unique(x, [1,2]) # get first unique rows for given index + +#- + +unique(x) # now we look at whole rows + +#- + +nonunique(x, :A) # get indicators of non-unique rows + +#- + +unique!(x, :B) # modify x in place + +# ### Extracting one row from `DataFrame` into a vector + +x = DataFrame(x=[1,missing,2], y=["a", "b", missing], z=[true,false,true]) + +#- + +cols = [:x, :y] +[x[1, col] for col in cols] # subset of columns + +#- + +[[x[i, col] for col in names(x)] for i in 1:nrow(x)] # vector of vectors, each entry contains one full row of x + +#- + +Tuple(x[1, col] for col in cols) # similar construct for Tuples, when ported to Julia 0.7 NamedTuples will be added + diff --git a/literate_notebooks/src-ES/07_factors.jl b/literate_notebooks/src-ES/07_factors.jl new file mode 100644 index 0000000..a3ff03c --- /dev/null +++ b/literate_notebooks/src-ES/07_factors.jl @@ -0,0 +1,231 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Working with CategoricalArrays + +#- + +# ### Constructor + +x = 
categorical(["A", "B", "B", "C"]) # unordered + +#- + +y = categorical(["A", "B", "B", "C"], ordered=true) # ordered, by default order is sorting order + +#- + +z = categorical(["A","B","B","C", missing]) # unordered with missings + +#- + +c = cut(1:10, 5) # ordered, into equal counts, possible to rename labels and give custom breaks + +#- + +by(DataFrame(x=cut(randn(100000), 10)), :x, d -> DataFrame(n=nrow(d)), sort=true) # just to make sure it works right + +#- + +v = categorical([1,2,2,3,3]) # contains integers not strings + +#- + +Vector{Union{String, Missing}}(z) # sometimes you need to convert back to a standard vector + +# ### Managing levels + +arr = [x,y,z,c,v] + +#- + +isordered.(arr) # check if categorical array is ordered + +#- + +ordered!(x, true), isordered(x) # make x ordered + +#- + +ordered!(x, false), isordered(x) # and unordered again + +#- + +levels.(arr) # list levels + +#- + +unique.(arr) # missing will be included + +#- + +y[1] < y[2] # can compare as y is ordered + +#- + +v[1] < v[2] # not comparable, v is unordered although it contains integers + +#- + +levels!(y, ["C", "B", "A"]) # you can reorder levels, mostly useful for ordered CategoricalArrays + +#- + +y[1] < y[2] # observe that the order is changed + +#- + +levels!(z, ["A", "B"]) # you have to specify all levels that are present + +#- + +levels!(z, ["A", "B"], allow_missing=true) # unless the underlying array allows for missings and force removal of levels + +#- + +z[1] = "B" +z # now z has only "B" entries + +#- + +levels(z) # but it remembers the levels it had (the reason is mostly performance) + +#- + +droplevels!(z) # this way we can clean it up +levels(z) + +# ### Data manipulation + +x, levels(x) + +#- + +x[2] = "0" +x, levels(x) # new level added at the end (works only for unordered) + +#- + +v, levels(v) + +#- + +v[1] + v[2] # even though underlying data is Int, we cannot operate on it + +#- + +Vector{Int}(v) # you have either to retrieve the data by conversion (may be 
expensive) + +#- + +get(v[1]) + get(v[2]) # or get a single value + +#- + +get.(v) # this will work for arrays without missings + +#- + +get.(z) # but will fail on missing values + +#- + +Vector{Union{String, Missing}}(z) # you have to do the conversion + +#- + +z[1]*z[2], z.^2 # the only exception are CategoricalArrays based on String - you can operate on them normally + +#- + +recode([1,2,3,4,5,missing], 1=>10) # recode some values in an array; has also in place recode! equivalent + +#- + +recode([1,2,3,4,5,missing], "a", 1=>10, 2=>20) # here we provided a default value for not mapped recodings + +#- + +recode([1,2,3,4,5,missing], 1=>10, missing=>"missing") # to recode Missing you have to do it explicitly + +#- + +t = categorical([1:5; missing]) +t, levels(t) + +#- + +recode!(t, [1,3]=>2) +t, levels(t) # note that the levels are dropped after recode + +#- + +t = categorical([1,2,3], ordered=true) +levels(recode(t, 2=>0, 1=>-1)) # and if you introduce new levels they are added at the end in the order of appearance + +#- + +t = categorical([1,2,3,4,5], ordered=true) # when using default it becomes the last level +levels(recode(t, 300, [1,2]=>100, 3=>200)) + +# ### Comparisons + +x = categorical([1,2,3]) +xs = [x, categorical(x), categorical(x, ordered=true), categorical(x, ordered=true)] +levels!(xs[2], [3,2,1]) +levels!(xs[4], [2,3,1]) +[a == b for a in xs, b in xs] # all are equal - comparison only by contents + +#- + +signature(x::CategoricalArray) = (x, levels(x), isordered(x)) # this is actually the full signature of CategoricalArray +## all are different, notice that xs[1] and xs[2] are unordered but have a different order of levels +[signature(a) == signature(b) for a in xs, b in xs] + +#- + +x[1] < x[2] # you cannot compare elements of unordered CategoricalArray + +#- + +t[1] < t[2] # but you can do it for an ordered one + +#- + +isless(x[1], x[2]) # isless works within the same CategoricalArray even if it is not ordered + +#- + +y = deepcopy(x) # but not 
across categorical arrays +isless(x[1], y[2]) + +#- + +isless(get(x[1]), get(y[2])) # you can use get to make a comparison of the contents of CategoricalArray + +#- + +x[1] == y[2] # equality tests works OK across CategoricalArrays + +# ### Categorical columns in a DataFrame + +df = DataFrame(x = 1:3, y = 'a':'c', z = ["a","b","c"]) + +#- + +categorical!(df) # converts all eltype(AbstractString) columns to categorical + +#- + +showcols(df) + +#- + +categorical!(df, :x) # manually convert to categorical column :x + +#- + +showcols(df) + diff --git a/literate_notebooks/src-ES/08_joins.jl b/literate_notebooks/src-ES/08_joins.jl new file mode 100644 index 0000000..e52bc22 --- /dev/null +++ b/literate_notebooks/src-ES/08_joins.jl @@ -0,0 +1,76 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2017** + +using DataFrames # load package + +# ## Joining DataFrames + +#- + +# ### Preparing DataFrames for a join + +x = DataFrame(ID=[1,2,3,4,missing], name = ["Alice", "Bob", "Conor", "Dave","Zed"]) +y = DataFrame(id=[1,2,5,6,missing], age = [21,22,23,24,99]) +x,y + +#- + +rename!(x, :ID=>:id) # names of columns on which we want to join must be the same + +# ### Standard joins: inner, left, right, outer, semi, anti + +join(x, y, on=:id) # :inner join by default, missing is joined + +#- + +join(x, y, on=:id, kind=:left) + +#- + +join(x, y, on=:id, kind=:right) + +#- + +join(x, y, on=:id, kind=:outer) + +#- + +join(x, y, on=:id, kind=:semi) + +#- + +join(x, y, on=:id, kind=:anti) + +# ### Cross join + +## cross-join does not require on argument +## it produces a Cartesian product or arguments +function expand_grid(;xs...) 
# a simple replacement for expand.grid in R + reduce((x,y) -> join(x, DataFrame(Pair(y...)), kind=:cross), + DataFrame(Pair(xs[1]...)), xs[2:end]) +end + +expand_grid(a=[1,2], b=["a","b","c"], c=[true,false]) + +# ### Complex cases of joins + +x = DataFrame(id1=[1,1,2,2,missing,missing], + id2=[1,11,2,21,missing,99], + name = ["Alice", "Bob", "Conor", "Dave","Zed", "Zoe"]) +y = DataFrame(id1=[1,1,3,3,missing,missing], + id2=[11,1,31,3,missing,999], + age = [21,22,23,24,99, 100]) +x,y + +#- + +join(x, y, on=[:id1, :id2]) # joining on two columns + +#- + +join(x, y, on=[:id1], makeunique=true) # with duplicates all combinations are produced (here :inner join) + +#- + +join(x, y, on=[:id1], kind=:semi) # but not by :semi join (as it would duplicate rows) + diff --git a/literate_notebooks/src-ES/09_reshaping.jl b/literate_notebooks/src-ES/09_reshaping.jl new file mode 100644 index 0000000..d6ec25b --- /dev/null +++ b/literate_notebooks/src-ES/09_reshaping.jl @@ -0,0 +1,90 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Reshaping DataFrames + +#- + +# ### Wide to long + +x = DataFrame(id=[1,2,3,4], id2=[1,1,2,2], M1=[11,12,13,14], M2=[111,112,113,114]) + +#- + +melt(x, :id, [:M1, :M2]) # first pass id-variables and then measure variables; meltdf makes a view + +#- + +## optionally you can rename columns; melt and stack are identical but order of arguments is reversed +stack(x, [:M1, :M2], :id, variable_name=:key, value_name=:observed) # first measures and then id-s; stackdf creates view + +#- + +## if second argument is omitted in melt or stack , all other columns are assumed to be the second argument +## but measure variables are selected only if they are <: AbstractFloat +melt(x, [:id, :id2]) + +#- + +melt(x, [1, 2]) # you can use index instead of symbol + +#- + +bigx = DataFrame(rand(10^6, 10)) # a test comparing creation of new DataFrame and a view +bigx[:id] = 
1:10^6 +@time melt(bigx, :id) +@time melt(bigx, :id) +@time meltdf(bigx, :id) +@time meltdf(bigx, :id); + +#- + +x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) + +#- + +melt(x) + +#- + +melt(DataFrame(rand(3,2))) # by default stack and melt treat floats as value columns + +#- + +df = DataFrame(rand(3,2)) +df[:key] = [1,1,1] +mdf = melt(df) # duplicates in key are silently accepted + +# ### Long to wide + +x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) + +#- + +y = melt(x, [1,2]) +display(x) +display(y) + +#- + +unstack(y, :id2, :variable, :value) # standard unstack with a unique key + +#- + +unstack(y, :variable, :value) # all other columns are treated as keys + +#- + +## by default :id, :variable and :value names are assumed; in this case it produces duplicate keys +unstack(y) + +#- + +df = stack(DataFrame(rand(3,2))) + +#- + +unstack(df, :variable, :value) # unable to unstack when no key column is present + diff --git a/literate_notebooks/src-ES/10_transforms.jl b/literate_notebooks/src-ES/10_transforms.jl new file mode 100644 index 0000000..3b5b4aa --- /dev/null +++ b/literate_notebooks/src-ES/10_transforms.jl @@ -0,0 +1,80 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Split-apply-combine + +x = DataFrame(id=[1,2,3,4,1,2,3,4], id2=[1,2,1,2,1,2,1,2], v=rand(8)) + +#- + +gx1 = groupby(x, :id) + +#- + +gx2 = groupby(x, [:id, :id2]) + +#- + +vcat(gx2...) 
# back to the original DataFrame + +#- + +x = DataFrame(id = [missing, 5, 1, 3, missing], x = 1:5) + +#- + +showall(groupby(x, :id)) # by default groups include missing values and are not sorted + +#- + +showall(groupby(x, :id, sort=true, skipmissing=true)) # but we can change it :) + +#- + +x = DataFrame(id=rand('a':'d', 100), v=rand(100)); +by(x, :id, y->mean(y[:v])) # apply a function to each group of a data frame + +#- + +by(x, :id, y->mean(y[:v]), sort=true) # we can sort the output + +#- + +by(x, :id, y->DataFrame(res=mean(y[:v]))) # this way we can set a name for a column - DataFramesMeta @by is better + +#- + +x = DataFrame(id=rand('a':'d', 100), x1=rand(100), x2=rand(100)) +aggregate(x, :id, sum) # apply a function over all columns of a data frame in groups given by id + +#- + +aggregate(x, :id, sum, sort=true) # also can be sorted + +# *We omit the discussion of map/combine as I do not find them very useful (better to use by)* + +x = DataFrame(rand(3, 5)) + +#- + +map(mean, eachcol(x)) # map a function over each column and return a data frame + +#- + +foreach(c -> println(c[1], ": ", mean(c[2])), eachcol(x)) # a raw iteration returns a tuple with column name and values + +#- + +colwise(mean, x) # colwise is similar, but produces a vector + +#- + +x[:id] = [1,1,2] +colwise(mean,groupby(x, :id)) # and works on GroupedDataFrame + +#- + +map(r -> r[:x1]/r[:x2], eachrow(x)) # now the returned value is DataFrameRow which works similarly to a one-row DataFrame + diff --git a/literate_notebooks/src-ES/11_performance.jl b/literate_notebooks/src-ES/11_performance.jl new file mode 100644 index 0000000..005e877 --- /dev/null +++ b/literate_notebooks/src-ES/11_performance.jl @@ -0,0 +1,135 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames +using BenchmarkTools + +# ## Performance tips + +#- + +# ### Access by column number is faster than by name + +x = DataFrame(rand(5, 1000)) +@btime 
x[500]; +@btime x[:x500]; + +# ### When working with data `DataFrame` use barrier functions or type annotation + +function f_bad() # this function will be slow + srand(1); x = DataFrame(rand(1000000,2)) + y, z = x[1], x[2] + p = 0.0 + for i in 1:nrow(x) + p += y[i]*z[i] + end + p +end + +@btime f_bad(); + +#- + +@code_warntype f_bad() # the reason is that Julia does not know the types of columns in `DataFrame` + +#- + +## solution 1 is to use barrier function (it should be possible to use it in almost any code) +function f_inner(y,z) + p = 0.0 + for i in 1:length(y) + p += y[i]*z[i] + end + p +end + +function f_barrier() # extract the work to an inner function + srand(1); x = DataFrame(rand(1000000,2)) + f_inner(x[1], x[2]) +end + +function f_inbuilt() # or use inbuilt function if possible + srand(1); x = DataFrame(rand(1000000,2)) + dot(x[1], x[2]) +end + +@btime f_barrier(); +@btime f_inbuilt(); + +#- + +## solution 2 is to provide the types of extracted columns +## it is simpler but there are cases in which you will not know these types +function f_typed() + srand(1); x = DataFrame(rand(1000000,2)) + y::Vector{Float64}, z::Vector{Float64} = x[1], x[2] + p = 0.0 + for i in 1:nrow(x) + p += y[i]*z[i] + end + p +end + +@btime f_typed(); + +# ### Consider using delayed `DataFrame` creation technique + +function f1() + x = DataFrame(Float64, 10^4, 100) # we work with DataFrame directly + for c in 1:ncol(x) + d = x[c] + for r in 1:nrow(x) + d[r] = rand() + end + end + x +end + +function f2() + x = Vector{Any}(100) + for c in 1:length(x) + d = Vector{Float64}(10^4) + for r in 1:length(d) + d[r] = rand() + end + x[c] = d + end + DataFrame(x) # we delay creation of DataFrame after we have our job done +end + +@btime f1(); +@btime f2(); + +# ### You can add rows to a `DataFrame` in place and it is fast + +x = DataFrame(rand(10^6, 5)) +y = DataFrame(transpose(1.0:5.0)) +z = [1.0:5.0;] + +@btime vcat($x, $y); # creates a new DataFrame - slow +@btime append!($x, $y); # in 
place - fast + +x = DataFrame(rand(10^6, 5)) # reset to the same starting point +@btime push!($x, $z); # add a single row in place - fastest + +# ### Allowing `missing` as well as `categorical` slows down computations + +using StatsBase + +function test(data) # uses countmap function to test performance + println(eltype(data)) + x = rand(data, 10^6) + y = categorical(x) + println(" raw:") + @btime countmap($x) + println(" categorical:") + @btime countmap($y) + nothing +end + +test(1:10) +test([randstring() for i in 1:10]) +test(allowmissing(1:10)) +test(allowmissing([randstring() for i in 1:10])) + + diff --git a/literate_notebooks/src-ES/12_pitfalls.jl b/literate_notebooks/src-ES/12_pitfalls.jl new file mode 100644 index 0000000..8eb5e79 --- /dev/null +++ b/literate_notebooks/src-ES/12_pitfalls.jl @@ -0,0 +1,73 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames + +# ## Possible pitfalls + +#- + +# ### Know what is copied when creating a `DataFrame` + +x = DataFrame(rand(3, 5)) + +#- + +y = DataFrame(x) +x === y # no copying performed + +#- + +y = copy(x) +x === y # not the same object + +#- + +all(x[i] === y[i] for i in 1:ncol(x)) # but the columns are the same + +#- + +x = 1:3; y = [1, 2, 3]; df = DataFrame(x=x,y=y) # the same when creating arrays or assigning columns, except ranges + +#- + +y === df[:y] # the same object + +#- + +typeof(x), typeof(df[:x]) # range is converted to a vector + +# ### Do not modify the parent of `GroupedDataFrame` + +x = DataFrame(id=repeat([1,2], outer=3), x=1:6) +g = groupby(x, :id) + +#- + +x[1:3, 1]=[2,2,2] +g # well - it is wrong now, g is only a view + +# ### Remember that you can filter columns of a `DataFrame` using booleans + +srand(1) +x = DataFrame(rand(5, 5)) + +#- + +x[x[:x1] .< 0.25] # well - we have filtered columns not rows by accident as you can select columns using booleans + +#- + +x[x[:x1] .< 0.25, :] # probably this is what we wanted 
+
+# ### Column selection for DataFrame creates aliases unless explicitly copied + +x = DataFrame(a=1:3) +x[:b] = x[1] # alias +x[:c] = x[:, 1] # also alias +x[:d] = x[1][:] # copy +x[:e] = copy(x[1]) # explicit copy +display(x) +x[1,1] = 100 +display(x) + diff --git a/literate_notebooks/src-ES/13_extras.jl b/literate_notebooks/src-ES/13_extras.jl new file mode 100644 index 0000000..5140a31 --- /dev/null +++ b/literate_notebooks/src-ES/13_extras.jl @@ -0,0 +1,198 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 13, 2018** + +using DataFrames + +# ## Extras - selected functionalities of selected packages + +#- + +# ### FreqTables: creating cross tabulations + +using FreqTables +df = DataFrame(a=rand('a':'d', 1000), b=rand(["x", "y", "z"], 1000)) +ft = freqtable(df, :a, :b) # observe that dimensions are sorted if possible + +#- + +ft[1,1], ft['b', "z"] # you can index the result using numbers or names + +#- + +prop(ft, 1) # getting proportions - 1 means we want to calculate them in rows (first dimension) + +#- + +prop(ft, 2) # and columns are normalized to 1.0 now + +#- + +x = categorical(rand(1:3, 10)) +levels!(x, [3, 1, 2, 4]) # reordering levels and adding an extra level +freqtable(x) # order is preserved and not-used level is shown + +#- + +freqtable([1,1,2,3,missing]) # by default missings are listed + +#- + +freqtable([1,1,2,3,missing], skipmissing=true) # but we can skip them + +# ### DataFramesMeta - working on `DataFrame` + +using DataFramesMeta +df = DataFrame(x=1:8, y='a':'h', z=repeat([true,false], outer=4)) + +#- + +@with(df, :x+:z) # expressions with columns of DataFrame + +#- + +@with df begin # you can define code blocks + a = :x[:z] + b = :x[.!:z] + :y + [a; b] +end + +#- + +a # @with creates hard scope so variables do not leak out + +#- + +df2 = DataFrame(a = [:a, :b, :c]) +@with(df2, :a .== ^(:a)) # sometimes we want to work on raw Symbol, ^() escapes it + +#- + +df2 = DataFrame(x=1:3, y=4:6, z=7:9) 
+@with(df2, _I_(2:3)) # _I_(expression) is translated to df2[expression] + +#- + +@where(df, :x .< 4, :z .== true) # very useful macro for filtering + +#- + +@select(df, :x, y = 2*:x, z=:y) # create a new DataFrame based on the old one + +#- + +@transform(df, a=1, x = 2*:x, y=:x) # create a new DataFrame adding columns based on the old one + +#- + +@transform(df, a=1, b=:a) # old DataFrame is used and :a is not present there + +#- + +@orderby(df, :z, -:x) # sorting into a new data frame, less powerful than sort, but lightweight + +#- + +@linq df |> # chaining of operations on DataFrame + where(:x .< 5) |> + orderby(:z) |> + transform(x²=:x.^2) |> + select(:z, :x, :x²) + +#- + +f(df, col) = df[col] # you can define your own functions and put them in the chain +@linq df |> where(:x .<= 4) |> f(:x) + +# ### DataFramesMeta - working on grouped `DataFrame` + +df = DataFrame(a = 1:12, b = repeat('a':'d', outer=3)) +g = groupby(df, :b) + +#- + +@by(df, :b, first=first(:a), last=last(:a), mean=mean(:a)) # more convenient than by from DataFrames + +#- + +@based_on(g, first=first(:a), last=last(:a), mean=mean(:a)) # the same as by but on grouped DataFrame + +#- + +@where(g, mean(:a) > 6.5) # filter groups on aggregate conditions + +#- + +@orderby(g, -sum(:a)) # order groups on aggregate conditions + +#- + +@transform(g, center = mean(:a), centered = :a - mean(:a)) # perform operations within a group and return ungrouped DataFrame + +#- + +DataFrame(g) # a nice convenience function not defined in DataFrames + +#- + +@transform(g) # actually this is the same + +#- + +@linq df |> groupby(:b) |> where(mean(:a) > 6.5) |> DataFrame # you can do chaining on grouped DataFrames as well + +# ### DataFramesMeta - rowwise operations on `DataFrame` + +df = DataFrame(a = 1:12, b = repeat(1:4, outer=3)) + +#- + +## such conditions are often needed but are complex to write +@transform(df, x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no")) + +#- + +## one option is to use a function that 
works on a single observation and broadcast it +myfun(a, b) = a > 6 && b == 4 ? "yes" : "no" +@transform(df, x = myfun.(:a, :b)) + +#- + +## or you can use @byrow! macro that allows you to process DataFrame rowwise +@byrow! df begin + @newcol x::Vector{String} + :x = :a > 6 && :b == 4 ? "yes" : "no" +end + +# ### Visualizing data with StatPlots + +using StatPlots # you might need to setup Plots package and some plotting backend first + +#- + +## we present only a minimal functionality of the package + +#- + +srand(1) +df = DataFrame(x = sort(randn(1000)), y=randn(1000), z = [fill("b", 500); fill("a", 500)]) + +#- + +@df df plot(:x, :y, legend=:topleft, label="y(x)") # a most basic plot + +#- + +@df df density(:x, label="") # density plot + +#- + +@df df histogram(:y, label="y") # and a histogram + +#- + +@df df boxplot(:z, :x, label="x") + +#- + +@df df violin(:z, :y, label="y") + diff --git a/literate_notebooks/src/01_constructors.jl b/literate_notebooks/src/01_constructors.jl new file mode 100644 index 0000000..333a81e --- /dev/null +++ b/literate_notebooks/src/01_constructors.jl @@ -0,0 +1,143 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** +# +# Let's get started by loading the `DataFrames` package. + +using DataFrames + +# ## Constructors and conversion + +#- + +# ### Constructors +# +# In this section, you'll see many ways to create a `DataFrame` using the `DataFrame()` constructor. +# +# First, we could create an empty DataFrame, + +DataFrame() # empty DataFrame + +# Or we could call the constructor using keyword arguments to add columns to the `DataFrame`. + +DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3])) + +# We can create a `DataFrame` from a dictionary, in which case keys from the dictionary will be sorted to create the `DataFrame` columns. 
+ +x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b']) +DataFrame(x) + +# Rather than explicitly creating a dictionary first, as above, we could pass `DataFrame` arguments with the syntax of dictionary key-value pairs. +# +# Note that in this case, we use symbols to denote the column names and arguments are not sorted. For example, `:A`, the symbol, produces `A`, the name of the first column here: + +DataFrame(:A => [1,2], :B => [true, false], :C => ['a', 'b']) + +# Here we create a `DataFrame` from a vector of vectors, and each vector becomes a column. + +DataFrame([rand(3) for i in 1:3]) + +# For now we can construct a single `DataFrame` from a `Vector` of atoms, creating a `DataFrame` with a single row. In future releases of DataFrames.jl, this will throw an error. + +DataFrame(rand(3)) + +# Instead use a transposed vector if you have a vector of atoms (in this way you effectively pass a two dimensional array to the constructor which is supported). + +DataFrame(transpose([1, 2, 3])) + +# Pass a second argument to give the columns names. + +DataFrame([1:3, 4:6, 7:9], [:A, :B, :C]) + +# Here we create a `DataFrame` from a matrix, + +DataFrame(rand(3,4)) + +# and here we do the same but also pass column names. + +DataFrame(rand(3,4), Symbol.('a':'d')) + +# We can also construct an uninitialized DataFrame. +# +# Here we pass column types, names and number of rows; we get `missing` in column :C because `Any >: Missing`. + +DataFrame([Int, Float64, Any], [:A, :B, :C], 1) + +# Here we create a `DataFrame`, but column `:C` is #undef and Jupyter has problem with displaying it. (This works OK at the REPL.) +# +# This will be fixed in next release of DataFrames! + +DataFrame([Int, Float64, String], [:A, :B, :C], 1) + +# To initialize a `DataFrame` with column names, but no rows use + +DataFrame([Int, Float64, String], [:A, :B, :C], 0) + +# This syntax gives us a quick way to create homogenous `DataFrame`. 
+ +DataFrame(Int, 3, 5) + +# This example is similar, but has nonhomogenous columns. + +DataFrame([Int, Float64], 4) + +# Finally, we can create a `DataFrame` by copying an existing `DataFrame`. +# +# Note that `copy` creates a shallow copy. + +y = DataFrame(x) +z = copy(x) +(x === y), (x === z), isequal(x, z) + +# ### Conversion to a matrix +# +# Let's start by creating a `DataFrame` with two rows and two columns. + +x = DataFrame(x=1:2, y=["A", "B"]) + +# We can create a matrix by passing this `DataFrame` to `Matrix`. + +Matrix(x) + +# This would work even if the `DataFrame` had some `missing`s: + +x = DataFrame(x=1:2, y=[missing,"B"]) + +#- + +Matrix(x) + +# In the two previous matrix examples, Julia created matrices with elements of type `Any`. We can see more clearly that the type of matrix is inferred when we pass, for example, a `DataFrame` of integers to `Matrix`, creating a 2D `Array` of `Int64`s: + +x = DataFrame(x=1:2, y=3:4) + +#- + +Matrix(x) + +# In this next example, Julia correctly identifies that `Union` is needed to express the type of the resulting `Matrix` (which contains `missing`s). + +x = DataFrame(x=1:2, y=[missing,4]) + +#- + +Matrix(x) + +# Note that we can't force a conversion of `missing` values to `Int`s! + +Matrix{Int}(x) + +# ### Handling of duplicate column names +# +# We can pass the `makeunique` keyword argument to allow passing duplicate names (they get deduplicated) + +df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true) + +# Otherwise, duplicates will not be allowed in the future. + +df = DataFrame(:a=>1, :a=>2, :a_1=>3) + +# A constructor that is passed column names as keyword arguments is a corner case. +# You cannot pass `makeunique` to allow duplicates here. 
+ +df = DataFrame(a=1, a=2, makeunique=true) + diff --git a/literate_notebooks/src/02_basicinfo.jl b/literate_notebooks/src/02_basicinfo.jl new file mode 100644 index 0000000..6cde7c6 --- /dev/null +++ b/literate_notebooks/src/02_basicinfo.jl @@ -0,0 +1,76 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Getting basic information about a data frame +# +# Let's start by creating a `DataFrame` object, `x`, so that we can learn how to get information on that data frame. + +x = DataFrame(A = [1, 2], B = [1.0, missing], C = ["a", "b"]) + +# The standard `size` function works to get dimensions of the `DataFrame`, + +size(x), size(x, 1), size(x, 2) + +# as well as `nrow` and `ncol` from R; `length` gives number of columns. + +nrow(x), ncol(x), length(x) + +# `describe` gives basic summary statistics of data in your `DataFrame`. + +describe(x) + +# Use `showcols` to get information about columns stored in a DataFrame. + +showcols(x) + +# `names` will return the names of all columns, + +names(x) + +# and `eltypes` returns their types. + +eltypes(x) + +# Here we create some large DataFrame + +y = DataFrame(rand(1:10, 1000, 10)); + +# and then we can use `head` to peek into its top rows + +head(y) + +# and `tail` to see its bottom rows. + +tail(y, 3) + +# ### Most elementary get and set operations +# +# Given the `DataFrame`, `x`, here are three ways to grab one of its columns as a `Vector`: + +x[1], x[:A], x[:, 1] + +# To grab one row as a DataFrame, we can index as follows. + +x[1, :] + +# We can grab a single cell or element with the same syntax to grab an element of an array. + +x[1, 1] + +# Assignment can be done in ranges to a scalar, + +x[1:2, 1:2] = 1 +x + +# to a vector of length equal to the number of assigned rows, + +x[1:2, 1:2] = [1,2] +x + +# or to another data frame of matching size. 
+ +x[1:2, 1:2] = DataFrame([5 6; 7 8]) +x + diff --git a/literate_notebooks/src/03_missingvalues.jl b/literate_notebooks/src/03_missingvalues.jl new file mode 100644 index 0000000..1e17d97 --- /dev/null +++ b/literate_notebooks/src/03_missingvalues.jl @@ -0,0 +1,112 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Handling missing values +# +# A singleton type `Missings.Missing` allows us to deal with missing values. + +missing, typeof(missing) + +# Arrays automatically create an appropriate union type. + +x = [1, 2, missing, 3] + +# `ismissing` checks if passed value is missing. + +ismissing(1), ismissing(missing), ismissing(x), ismissing.(x) + +# We can extract the type combined with Missing from a `Union` via +# +# (This is useful for arrays!) + +eltype(x), Missings.T(eltype(x)) + +# `missing` comparisons produce `missing`. + +missing == missing, missing != missing, missing < missing + +# This is also true when `missing`s are compared with values of other types. + +1 == missing, 1 != missing, 1 < missing + +# `isequal`, `isless`, and `===` produce results of type `Bool`. + +isequal(missing, missing), missing === missing, isequal(1, missing), isless(1, missing) + +# In the next few examples, we see that many (not all) functions handle `missing`. + +map(x -> x(missing), [sin, cos, zero, sqrt]) # part 1 + +#- + +map(x -> x(missing, 1), [+, - , *, /, div]) # part 2 + +#- + +map(x -> x([1,2,missing]), [minimum, maximum, extrema, mean, any, float]) # part 3 + +# `skipmissing` returns iterator skipping missing values. We can use `collect` and `skipmissing` to create an array that excludes these missing values. + +collect(skipmissing([1, missing, 2, missing])) + +# Similarly, here we combine `collect` and `Missings.replace` to create an array that replaces all missing values with some value (`NaN` in this case). 
+ +collect(Missings.replace([1.0, missing, 2.0, missing], NaN)) + +# Another way to do this: + +coalesce.([1.0, missing, 2.0, missing], NaN) + +# Caution: `nothing` would also be replaced here (for Julia 0.7 a more sophisticated behavior of `coalesce` that allows to avoid this problem is planned). + +coalesce.([1.0, missing, nothing, missing], NaN) + +# You can use `recode` if you have homogenous output types. + +recode([1.0, missing, 2.0, missing], missing=>NaN) + +# You can use `unique` or `levels` to get unique values with or without missings, respectively. + +unique([1, missing, 2, missing]), levels([1, missing, 2, missing]) + +# In this next example, we convert `x` to `y` with `allowmissing`, where `y` has a type that accepts missings. + +x = [1,2,3] +y = allowmissing(x) + +# Then, we convert back with `disallowmissing`. This would fail if `y` contained missing values! + +z = disallowmissing(y) +x,y,z + +# In this next example, we show that the type of each column in `x` is initially `Int64`. After using `allowmissing!` to accept missing values in columns 1 and 3, the types of those columns become `Union`s of `Int64` and `Missings.Missing`. + +x = DataFrame(Int, 2, 3) +println("Before: ", eltypes(x)) +allowmissing!(x, 1) # make first column accept missings +allowmissing!(x, :x3) # make :x3 column accept missings +println("After: ", eltypes(x)) + +# In this next example, we'll use `completecases` to find all the rows of a `DataFrame` that have complete data. + +x = DataFrame(A=[1, missing, 3, 4], B=["A", "B", missing, "C"]) +println(x) +println("Complete cases:\n", completecases(x)) + +# We can use `dropmissing` or `dropmissing!` to remove the rows with incomplete data from a `DataFrame` and either create a new `DataFrame` or mutate the original in-place. + +y = dropmissing(x) +dropmissing!(x) +[x, y] + +# When we call `showcols` on a `DataFrame` with dropped missing values, the columns still allow missing values. 
+ +showcols(x) + +# Since we've excluded missing values, we can safely use `disallowmissing!` so that the columns will no longer accept missing values. + +disallowmissing!(x) +showcols(x) + diff --git a/literate_notebooks/src/04_loadsave.jl b/literate_notebooks/src/04_loadsave.jl new file mode 100644 index 0000000..d166830 --- /dev/null +++ b/literate_notebooks/src/04_loadsave.jl @@ -0,0 +1,64 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Load and save DataFrames +# We do not cover all features of the packages. Please refer to their documentation to learn them. +# +# Here we'll load `CSV` to read and write CSV files and `JLD`, which allows us to work with a Julia native binary format. + +using CSV +using JLD + +# Let's create a simple `DataFrame` for testing purposes, + +x = DataFrame(A=[true, false, true], B=[1, 2, missing], + C=[missing, "b", "c"], D=['a', missing, 'c']) + + +# and use `eltypes` to look at the columnwise types. + +eltypes(x) + +# Let's use `CSV` to save `x` to disk; make sure `x.csv` does not conflict with some file in your working directory. + +CSV.write("x.csv", x) + +# Now we can see how it was saved by reading `x.csv`. + +print(read("x.csv", String)) + +# We can also load it back. `use_mmap=false` disables memory mapping so that on Windows the file can be deleted in the same session. + +y = CSV.read("x.csv", use_mmap=false) + +# When loading in a `DataFrame` from a `CSV`, all columns allow `Missing` by default. Note that the column types have changed! + +eltypes(y) + +# Now let's save `x` to a file in a binary format; make sure that `x.jld` does not exist in your working directory. + +save("x.jld", "x", x) + +# After loading in `x.jld` as `y`, `y` is identical to `x`. + +y = load("x.jld", "x") + +# Note that the column types of `y` are the same as those of `x`! 
+ +eltypes(y) + +# Next, we'll create the files `bigdf.csv` and `bigdf.jld`, so be careful that you don't already have these files on disc! +# +# In particular, we'll time how long it takes us to write a `DataFrame` with 10^3 rows and 10^2 columns to `.csv` and `.jld` files. *You can expect JLD to be faster!* Use `compress=true` to reduce file sizes. + +bigdf = DataFrame(Bool, 10^3, 10^2) +@time CSV.write("bigdf.csv", bigdf) +@time save("bigdf.jld", "bigdf", bigdf) +getfield.(stat.(["bigdf.csv", "bigdf.jld"]), :size) + +# Finally, let's clean up. Do not run the next cell unless you are sure that it will not erase your important files. + +foreach(rm, ["x.csv", "x.jld", "bigdf.csv", "bigdf.jld"]) + diff --git a/literate_notebooks/src/05_columns.jl b/literate_notebooks/src/05_columns.jl new file mode 100644 index 0000000..f32e02a --- /dev/null +++ b/literate_notebooks/src/05_columns.jl @@ -0,0 +1,187 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** + +using DataFrames # load package + +# ## Manipulating columns of DataFrame + +#- + +# ### Renaming columns +# +# Let's start with a `DataFrame` of `Bool`s that has default column names. + +x = DataFrame(Bool, 3, 4) + +# With `rename`, we create new `DataFrame`; here we rename the column `:x1` to `:A`. (`rename` also accepts collections of Pairs.) + +rename(x, :x1 => :A) + +# With `rename!` we do an in place transformation. +# +# This time we've applied a function to every column name. + +rename!(c -> Symbol(string(c)^2), x) + +# We can also change the name of a particular column without knowing the original. +# +# Here we change the name of the third column, creating a new `DataFrame`. + +rename(x, names(x)[3] => :third) + +# With `names!`, we can change the names of all variables. 
+ +names!(x, [:a, :b, :c, :d]) + +# We get an error when we try to provide duplicate names + +names!(x, fill(:a, 4)) + +# unless we pass `makeunique=true`, which allows us to handle duplicates in passed names. + +names!(x, fill(:a, 4), makeunique=true) + +# ### Reordering columns + +#- + +# We can reorder the names(x) vector as needed, creating a new DataFrame. + +srand(1234) +x[shuffle(names(x))] + +# also `permutecols!` will be introduced in next release of DataFrames + +#- + +# ### Merging/adding columns + +x = DataFrame([(i,j) for i in 1:3, j in 1:4]) + +# With `hcat` we can merge two `DataFrame`s. Also [x y] syntax is supported but only when DataFrames have unique column names. + +hcat(x, x, makeunique=true) + +# We can also use `hcat` to add a new column; a default name `:x1` will be used for this column, so `makeunique=true` is needed. + +y = hcat(x, [1,2,3], makeunique=true) + +# You can also prepend a vector with `hcat`. + +hcat([1,2,3], x, makeunique=true) + +# Alternatively you could append a vector with the following syntax. This is a bit more verbose but cleaner. + +y = [x DataFrame(A=[1,2,3])] + +# Here we do the same but add column `:A` to the front. + +y = [DataFrame(A=[1,2,3]) x] + +# A column can also be added in the middle. Here a brute-force method is used and a new DataFrame is created. + +using BenchmarkTools +@btime [$x[1:2] DataFrame(A=[1,2,3]) $x[3:4]] + +# We could also do this with a specialized in place method `insert!`. Let's add `:newcol` to the `DataFrame` `y`. + +insert!(y, 2, [1,2,3], :newcol) + +# If you want to insert the same column name several times `makeunique=true` is needed as usual. + +insert!(y, 2, [1,2,3], :newcol, makeunique=true) + +# We can see how much faster it is to insert a column with `insert!` than with `hcat` using `@btime`. + +@btime insert!(copy($x), 3, [1,2,3], :A) + +# Let's use `insert!` to append a column in place, + +insert!(x, ncol(x)+1, [1,2,3], :A) + +# and to in place prepend a column. 
+ +insert!(x, 1, [1,2,3], :B) + +# With `merge!`, let's merge the second DataFrame into first, but overwriting duplicates. + +df1 = DataFrame(x=1:3, y=4:6) +df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) +df1, df2, merge!(df1, df2) + +# For comparison: merge two `DataFrames`s but renaming duplicate names via `hcat`. + +df1 = DataFrame(x=1:3, y=4:6) +df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) +hcat(df1, df2, makeunique=true) + +# ### Subsetting/removing columns +# +# Let's create a new `DataFrame` `x` and show a few ways to create DataFrames with a subset of `x`'s columns. + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# First we could do this by index + +x[[1,2,4,5]] + +# or by column name. + +x[[:x1, :x4]] + +# We can also choose to keep or exclude columns by `Bool`. (We need a vector whose length is the number of columns in the original `DataFrame`.) + +x[[true, false, true, false, true]] + +# Here we create a single column `DataFrame`, + +x[[:x1]] + +# and here we access the vector contained in column `:x1`. + +x[:x1] + +# We could grab the same vector by column number + +x[1] + +# and remove everything from a `DataFrame` with `empty!`. + +empty!(y) + +# Here we create a copy of `x` and delete the 3rd column from the copy with `delete!`. + +z = copy(x) +x, delete!(z, 3) + +# ### Modify column by name + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# With the following syntax, the existing column is modified without performing any copying. + +x[:x1] = x[:x2] +x + +# We can also use the following syntax to add a new column at the end of a `DataFrame`. + +x[:A] = [1,2,3] +x + +# A new column name will be added to our `DataFrame` with the following syntax as well (7 is equal to `ncol(x)+1`). 
+ +x[7] = 11:13 +x + +# ### Find column name + +x = DataFrame([(i,j) for i in 1:3, j in 1:5]) + +# We can check if a column with a given name exists via + +:x1 in names(x) + +# and determine its index via + +findfirst(names(x), :x2) + diff --git a/literate_notebooks/src/06_rows.jl b/literate_notebooks/src/06_rows.jl new file mode 100644 index 0000000..3660e40 --- /dev/null +++ b/literate_notebooks/src/06_rows.jl @@ -0,0 +1,177 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package +srand(1); + +# ## Manipulating rows of DataFrame + +#- + +# ### Reordering rows + +x = DataFrame(id=1:10, x = rand(10), y = [zeros(5); ones(5)]) # and we hope that x[:x] is not sorted :) + +#- + +issorted(x), issorted(x, :x) # check if a DataFrame or a subset of its columns is sorted + +#- + +sort!(x, :x) # sort x in place + +#- + +y = sort(x, :id) # new DataFrame + +#- + +sort(x, (:y, :x), rev=(true, false)) # sort by two columns, first is decreasing, second is increasing + +#- + +sort(x, (order(:y, rev=true), :x)) # the same as above + +#- + +sort(x, (order(:y, rev=true), order(:x, by=v->-v))) # some more fancy sorting stuff + +#- + +x[shuffle(1:10), :] # reorder rows (here randomly) + +#- + +sort!(x, :id) +x[[1,10],:] = x[[10,1],:] # swap rows +x + +#- + +x[1,:], x[10,:] = x[10,:], x[1,:] # and swap again +x + +# ### Merging/adding rows + +x = DataFrame(rand(3, 5)) + +#- + +[x; x] # merge by rows - data frames must have the same column names; the same is vcat + +#- + +y = x[reverse(names(x))] # get y with other order of names + +#- + +vcat(x, y) # we get what we want as vcat does column name matching + +#- + +vcat(x, y[1:3]) # but column names must still match + +#- + +append!(x, x) # the same but modifies x + +#- + +append!(x, y) # here column names must match exactly + +#- + +push!(x, 1:5) # add one row to x at the end; must give correct number of values and correct types +x + +#- + +push!(x, 
Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # also works with dictionaries +x + +# ### Subsetting/removing rows + +x = DataFrame(id=1:10, val='a':'j') + +#- + +x[1:2, :] # by index + +#- + +view(x, 1:2) # the same but a view + +#- + +x[repmat([true, false], 5), :] # by Bool, exact length required + +#- + +view(x, repmat([true, false], 5), :) # view again + +#- + +deleterows!(x, 7) # delete one row + +#- + +deleterows!(x, 6:7) # delete a collection of rows + +#- + +x = DataFrame([1:4, 2:5, 3:6]) + +#- + +filter(r -> r[:x1] > 2.5, x) # create a new DataFrame where filtering function operates on DataFrameRow + +#- + +## in place modification of x, an example with do-block syntax +filter!(x) do r + if r[:x1] > 2.5 + return r[:x2] < 4.5 + end + r[:x3] < 3.5 +end + +# ### Deduplicating + +x = DataFrame(A=[1,2], B=["x","y"]) +append!(x, x) +x[:C] = 1:4 +x + +#- + +unique(x, [1,2]) # get first unique rows for given index + +#- + +unique(x) # now we look at whole rows + +#- + +nonunique(x, :A) # get indicators of non-unique rows + +#- + +unique!(x, :B) # modify x in place + +# ### Extracting one row from `DataFrame` into a vector + +x = DataFrame(x=[1,missing,2], y=["a", "b", missing], z=[true,false,true]) + +#- + +cols = [:x, :y] +[x[1, col] for col in cols] # subset of columns + +#- + +[[x[i, col] for col in names(x)] for i in 1:nrow(x)] # vector of vectors, each entry contains one full row of x + +#- + +Tuple(x[1, col] for col in cols) # similar construct for Tuples, when ported to Julia 0.7 NamedTuples will be added + diff --git a/literate_notebooks/src/07_factors.jl b/literate_notebooks/src/07_factors.jl new file mode 100644 index 0000000..a3ff03c --- /dev/null +++ b/literate_notebooks/src/07_factors.jl @@ -0,0 +1,231 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Working with CategoricalArrays + +#- + +# ### Constructor + +x = categorical(["A", "B", 
"B", "C"]) # unordered + +#- + +y = categorical(["A", "B", "B", "C"], ordered=true) # ordered, by default order is sorting order + +#- + +z = categorical(["A","B","B","C", missing]) # unordered with missings + +#- + +c = cut(1:10, 5) # ordered, into equal counts, possible to rename labels and give custom breaks + +#- + +by(DataFrame(x=cut(randn(100000), 10)), :x, d -> DataFrame(n=nrow(d)), sort=true) # just to make sure it works right + +#- + +v = categorical([1,2,2,3,3]) # contains integers not strings + +#- + +Vector{Union{String, Missing}}(z) # sometimes you need to convert back to a standard vector + +# ### Managing levels + +arr = [x,y,z,c,v] + +#- + +isordered.(arr) # chcek if categorical array is orderd + +#- + +ordered!(x, true), isordered(x) # make x ordered + +#- + +ordered!(x, false), isordered(x) # and unordered again + +#- + +levels.(arr) # list levels + +#- + +unique.(arr) # missing will be included + +#- + +y[1] < y[2] # can compare as y is ordered + +#- + +v[1] < v[2] # not comparable, v is unordered although it contains integers + +#- + +levels!(y, ["C", "B", "A"]) # you can reorder levels, mostly useful for ordered CategoricalArrays + +#- + +y[1] < y[2] # observe that the order is changed + +#- + +levels!(z, ["A", "B"]) # you have to specify all levels that are present + +#- + +levels!(z, ["A", "B"], allow_missing=true) # unless the underlying array allows for missings and force removal of levels + +#- + +z[1] = "B" +z # now z has only "B" entries + +#- + +levels(z) # but it remembers the levels it had (the reason is mostly performance) + +#- + +droplevels!(z) # this way we can clean it up +levels(z) + +# ### Data manipulation + +x, levels(x) + +#- + +x[2] = "0" +x, levels(x) # new level added at the end (works only for unordered) + +#- + +v, levels(v) + +#- + +v[1] + v[2] # even though underlying data is Int, we cannot operate on it + +#- + +Vector{Int}(v) # you have either to retrieve the data by conversion (may be expensive) + +#- + +get(v[1]) 
+ get(v[2]) # or get a single value + +#- + +get.(v) # this will work for arrays witout missings + +#- + +get.(z) # but will fail on missing values + +#- + +Vector{Union{String, Missing}}(z) # you have to do the conversion + +#- + +z[1]*z[2], z.^2 # the only exception are CategoricalArrays based on String - you can operate on them normally + +#- + +recode([1,2,3,4,5,missing], 1=>10) # recode some values in an array; has also in place recode! equivalent + +#- + +recode([1,2,3,4,5,missing], "a", 1=>10, 2=>20) # here we provided a default value for not mapped recodings + +#- + +recode([1,2,3,4,5,missing], 1=>10, missing=>"missing") # to recode Missing you have to do it explicitly + +#- + +t = categorical([1:5; missing]) +t, levels(t) + +#- + +recode!(t, [1,3]=>2) +t, levels(t) # note that the levels are dropped after recode + +#- + +t = categorical([1,2,3], ordered=true) +levels(recode(t, 2=>0, 1=>-1)) # and if you introduce a new levels they are added at the end in the order of appearance + +#- + +t = categorical([1,2,3,4,5], ordered=true) # when using default it becomes the last level +levels(recode(t, 300, [1,2]=>100, 3=>200)) + +# ### Comparisons + +x = categorical([1,2,3]) +xs = [x, categorical(x), categorical(x, ordered=true), categorical(x, ordered=true)] +levels!(xs[2], [3,2,1]) +levels!(xs[4], [2,3,1]) +[a == b for a in xs, b in xs] # all are equal - comparison only by contents + +#- + +signature(x::CategoricalArray) = (x, levels(x), isordered(x)) # this is actually the full signature of CategoricalArray +## all are different, notice that x[1] and x[2] are unordered but have a different order of levels +[signature(a) == signature(b) for a in xs, b in xs] + +#- + +x[1] < x[2] # you cannot compare elements of unordered CategoricalArray + +#- + +t[1] < t[2] # but you can do it for an ordered one + +#- + +isless(x[1], x[2]) # isless works within the same CategoricalArray even if it is not ordered + +#- + +y = deepcopy(x) # but not across categorical arrays 
+isless(x[1], y[2]) + +#- + +isless(get(x[1]), get(y[2])) # you can use get to make a comparison of the contents of CategoricalArray + +#- + +x[1] == y[2] # equality tests works OK across CategoricalArrays + +# ### Categorical columns in a DataFrame + +df = DataFrame(x = 1:3, y = 'a':'c', z = ["a","b","c"]) + +#- + +categorical!(df) # converts all eltype(AbstractString) columns to categorical + +#- + +showcols(df) + +#- + +categorical!(df, :x) # manually convert to categorical column :x + +#- + +showcols(df) + diff --git a/literate_notebooks/src/08_joins.jl b/literate_notebooks/src/08_joins.jl new file mode 100644 index 0000000..e52bc22 --- /dev/null +++ b/literate_notebooks/src/08_joins.jl @@ -0,0 +1,76 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2017** + +using DataFrames # load package + +# ## Joining DataFrames + +#- + +# ### Preparing DataFrames for a join + +x = DataFrame(ID=[1,2,3,4,missing], name = ["Alice", "Bob", "Conor", "Dave","Zed"]) +y = DataFrame(id=[1,2,5,6,missing], age = [21,22,23,24,99]) +x,y + +#- + +rename!(x, :ID=>:id) # names of columns on which we want to join must be the same + +# ### Standard joins: inner, left, right, outer, semi, anti + +join(x, y, on=:id) # :inner join by default, missing is joined + +#- + +join(x, y, on=:id, kind=:left) + +#- + +join(x, y, on=:id, kind=:right) + +#- + +join(x, y, on=:id, kind=:outer) + +#- + +join(x, y, on=:id, kind=:semi) + +#- + +join(x, y, on=:id, kind=:anti) + +# ### Cross join + +## cross-join does not require on argument +## it produces a Cartesian product or arguments +function expand_grid(;xs...) 
# a simple replacement for expand.grid in R + reduce((x,y) -> join(x, DataFrame(Pair(y...)), kind=:cross), + DataFrame(Pair(xs[1]...)), xs[2:end]) +end + +expand_grid(a=[1,2], b=["a","b","c"], c=[true,false]) + +# ### Complex cases of joins + +x = DataFrame(id1=[1,1,2,2,missing,missing], + id2=[1,11,2,21,missing,99], + name = ["Alice", "Bob", "Conor", "Dave","Zed", "Zoe"]) +y = DataFrame(id1=[1,1,3,3,missing,missing], + id2=[11,1,31,3,missing,999], + age = [21,22,23,24,99, 100]) +x,y + +#- + +join(x, y, on=[:id1, :id2]) # joining on two columns + +#- + +join(x, y, on=[:id1], makeunique=true) # with duplicates all combinations are produced (here :inner join) + +#- + +join(x, y, on=[:id1], kind=:semi) # but not by :semi join (as it would duplicate rows) + diff --git a/literate_notebooks/src/09_reshaping.jl b/literate_notebooks/src/09_reshaping.jl new file mode 100644 index 0000000..d6ec25b --- /dev/null +++ b/literate_notebooks/src/09_reshaping.jl @@ -0,0 +1,90 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Reshaping DataFrames + +#- + +# ### Wide to long + +x = DataFrame(id=[1,2,3,4], id2=[1,1,2,2], M1=[11,12,13,14], M2=[111,112,113,114]) + +#- + +melt(x, :id, [:M1, :M2]) # first pass id-variables and then measure variables; meltdf makes a view + +#- + +## optionally you can rename columns; melt and stack are identical but order of arguments is reversed +stack(x, [:M1, :M2], :id, variable_name=:key, value_name=:observed) # first measures and then id-s; stackdf creates view + +#- + +## if second argument is omitted in melt or stack , all other columns are assumed to be the second argument +## but measure variables are selected only if they are <: AbstractFloat +melt(x, [:id, :id2]) + +#- + +melt(x, [1, 2]) # you can use index instead of symbol + +#- + +bigx = DataFrame(rand(10^6, 10)) # a test comparing creation of new DataFrame and a view +bigx[:id] = 1:10^6 
+@time melt(bigx, :id) +@time melt(bigx, :id) +@time meltdf(bigx, :id) +@time meltdf(bigx, :id); + +#- + +x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) + +#- + +melt(x) + +#- + +melt(DataFrame(rand(3,2))) # by default stack and melt treats floats as value columns + +#- + +df = DataFrame(rand(3,2)) +df[:key] = [1,1,1] +mdf = melt(df) # duplicates in key are silently accepted + +# ### Long to wide + +x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) + +#- + +y = melt(x, [1,2]) +display(x) +display(y) + +#- + +unstack(y, :id2, :variable, :value) # stndard unstack with a unique key + +#- + +unstack(y, :variable, :value) # all other columns are treated as keys + +#- + +## by default :id, :variable and :value names are assumed; in this case it produces duplicate keys +unstack(y) + +#- + +df = stack(DataFrame(rand(3,2))) + +#- + +unstack(df, :variable, :value) # unable to unstack when no key column is present + diff --git a/literate_notebooks/src/10_transforms.jl b/literate_notebooks/src/10_transforms.jl new file mode 100644 index 0000000..3b5b4aa --- /dev/null +++ b/literate_notebooks/src/10_transforms.jl @@ -0,0 +1,80 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames # load package + +# ## Split-apply-combine + +x = DataFrame(id=[1,2,3,4,1,2,3,4], id2=[1,2,1,2,1,2,1,2], v=rand(8)) + +#- + +gx1 = groupby(x, :id) + +#- + +gx2 = groupby(x, [:id, :id2]) + +#- + +vcat(gx2...) 
# back to the original DataFrame + +#- + +x = DataFrame(id = [missing, 5, 1, 3, missing], x = 1:5) + +#- + +showall(groupby(x, :id)) # by default groups include mising values and are not sorted + +#- + +showall(groupby(x, :id, sort=true, skipmissing=true)) # but we can change it :) + +#- + +x = DataFrame(id=rand('a':'d', 100), v=rand(100)); +by(x, :id, y->mean(y[:v])) # apply a function to each group of a data frame + +#- + +by(x, :id, y->mean(y[:v]), sort=true) # we can sort the output + +#- + +by(x, :id, y->DataFrame(res=mean(y[:v]))) # this way we can set a name for a column - DataFramesMeta @by is better + +#- + +x = DataFrame(id=rand('a':'d', 100), x1=rand(100), x2=rand(100)) +aggregate(x, :id, sum) # apply a function over all columns of a data frame in groups given by id + +#- + +aggregate(x, :id, sum, sort=true) # also can be sorted + +# *We omit the discussion of of map/combine as I do not find them very useful (better to use by)* + +x = DataFrame(rand(3, 5)) + +#- + +map(mean, eachcol(x)) # map a function over each column and return a data frame + +#- + +foreach(c -> println(c[1], ": ", mean(c[2])), eachcol(x)) # a raw iteration returns a tuple with column name and values + +#- + +colwise(mean, x) # colwise is similar, but produces a vector + +#- + +x[:id] = [1,1,2] +colwise(mean,groupby(x, :id)) # and works on GroupedDataFrame + +#- + +map(r -> r[:x1]/r[:x2], eachrow(x)) # now the returned value is DataFrameRow which works similarly to a one-row DataFrame + diff --git a/literate_notebooks/src/11_performance.jl b/literate_notebooks/src/11_performance.jl new file mode 100644 index 0000000..005e877 --- /dev/null +++ b/literate_notebooks/src/11_performance.jl @@ -0,0 +1,135 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames +using BenchmarkTools + +# ## Performance tips + +#- + +# ### Access by column number is faster than by name + +x = DataFrame(rand(5, 1000)) +@btime x[500]; 
+@btime x[:x500]; + +# ### When working with data `DataFrame` use barrier functions or type annotation + +function f_bad() # this function will be slow + srand(1); x = DataFrame(rand(1000000,2)) + y, z = x[1], x[2] + p = 0.0 + for i in 1:nrow(x) + p += y[i]*z[i] + end + p +end + +@btime f_bad(); + +#- + +@code_warntype f_bad() # the reason is that Julia does not know the types of columns in `DataFrame` + +#- + +## solution 1 is to use barrier function (it should be possible to use it in almost any code) +function f_inner(y,z) + p = 0.0 + for i in 1:length(y) + p += y[i]*z[i] + end + p +end + +function f_barrier() # extract the work to an inner function + srand(1); x = DataFrame(rand(1000000,2)) + f_inner(x[1], x[2]) +end + +function f_inbuilt() # or use inbuilt function if possible + srand(1); x = DataFrame(rand(1000000,2)) + dot(x[1], x[2]) +end + +@btime f_barrier(); +@btime f_inbuilt(); + +#- + +## solution 2 is to provide the types of extracted columns +## it is simpler but there are cases in which you will not know these types +function f_typed() + srand(1); x = DataFrame(rand(1000000,2)) + y::Vector{Float64}, z::Vector{Float64} = x[1], x[2] + p = 0.0 + for i in 1:nrow(x) + p += y[i]*z[i] + end + p +end + +@btime f_typed(); + +# ### Consider using delayed `DataFrame` creation technique + +function f1() + x = DataFrame(Float64, 10^4, 100) # we work with DataFrame directly + for c in 1:ncol(x) + d = x[c] + for r in 1:nrow(x) + d[r] = rand() + end + end + x +end + +function f2() + x = Vector{Any}(100) + for c in 1:length(x) + d = Vector{Float64}(10^4) + for r in 1:length(d) + d[r] = rand() + end + x[c] = d + end + DataFrame(x) # we delay creation of DataFrame after we have our job done +end + +@btime f1(); +@btime f2(); + +# ### You can add rows to a `DataFrame` in place and it is fast + +x = DataFrame(rand(10^6, 5)) +y = DataFrame(transpose(1.0:5.0)) +z = [1.0:5.0;] + +@btime vcat($x, $y); # creates a new DataFrame - slow +@btime append!($x, $y); # in place - 
fast + +x = DataFrame(rand(10^6, 5)) # reset to the same starting point +@btime push!($x, $z); # add a single row in place - fastest + +# ### Allowing `missing` as well as `categorical` slows down computations + +using StatsBase + +function test(data) # uses countmap function to test performance + println(eltype(data)) + x = rand(data, 10^6) + y = categorical(x) + println(" raw:") + @btime countmap($x) + println(" categorical:") + @btime countmap($y) + nothing +end + +test(1:10) +test([randstring() for i in 1:10]) +test(allowmissing(1:10)) +test(allowmissing([randstring() for i in 1:10])) + + diff --git a/literate_notebooks/src/12_pitfalls.jl b/literate_notebooks/src/12_pitfalls.jl new file mode 100644 index 0000000..8eb5e79 --- /dev/null +++ b/literate_notebooks/src/12_pitfalls.jl @@ -0,0 +1,73 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** + +using DataFrames + +# ## Possible pitfalls + +#- + +# ### Know what is copied when creating a `DataFrame` + +x = DataFrame(rand(3, 5)) + +#- + +y = DataFrame(x) +x === y # no copyinng performed + +#- + +y = copy(x) +x === y # not the same object + +#- + +all(x[i] === y[i] for i in ncol(x)) # but the columns are the same + +#- + +x = 1:3; y = [1, 2, 3]; df = DataFrame(x=x,y=y) # the same when creating arrays or assigning columns, except ranges + +#- + +y === df[:y] # the same object + +#- + +typeof(x), typeof(df[:x]) # range is converted to a vector + +# ### Do not modify the parent of `GroupedDataFrame` + +x = DataFrame(id=repeat([1,2], outer=3), x=1:6) +g = groupby(x, :id) + +#- + +x[1:3, 1]=[2,2,2] +g # well - it is wrong now, g is only a view + +# ### Remember that you can filter columns of a `DataFrame` using booleans + +srand(1) +x = DataFrame(rand(5, 5)) + +#- + +x[x[:x1] .< 0.25] # well - we have filtered columns not rows by accident as you can select columns using booleans + +#- + +x[x[:x1] .< 0.25, :] # probably this is what we wanted + +# ### Column 
selection for DataFrame creates aliases unless explicitly copied + +x = DataFrame(a=1:3) +x[:b] = x[1] # alias +x[:c] = x[:, 1] # also alias +x[:d] = x[1][:] # copy +x[:e] = copy(x[1]) # explicit copy +display(x) +x[1,1] = 100 +display(x) + diff --git a/literate_notebooks/src/13_extras.jl b/literate_notebooks/src/13_extras.jl new file mode 100644 index 0000000..5140a31 --- /dev/null +++ b/literate_notebooks/src/13_extras.jl @@ -0,0 +1,198 @@ +# # Introduction to DataFrames +# **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 13, 2018** + +using DataFrames + +# ## Extras - selected functionalities of selected packages + +#- + +# ### FreqTables: creating cross tabulations + +using FreqTables +df = DataFrame(a=rand('a':'d', 1000), b=rand(["x", "y", "z"], 1000)) +ft = freqtable(df, :a, :b) # observe that dimensions are sorted if possible + +#- + +ft[1,1], ft['b', "z"] # you can index the result using numbers or names + +#- + +prop(ft, 1) # getting proportions - 1 means we want to calculate them in rows (first dimension) + +#- + +prop(ft, 2) # and columns are normalized to 1.0 now + +#- + +x = categorical(rand(1:3, 10)) +levels!(x, [3, 1, 2, 4]) # reordering levels and adding an extra level +freqtable(x) # order is preserved and not-used level is shown + +#- + +freqtable([1,1,2,3,missing]) # by default missings are listed + +#- + +freqtable([1,1,2,3,missing], skipmissing=true) # but we can skip them + +# ### DataFramesMeta - working on `DataFrame` + +using DataFramesMeta +df = DataFrame(x=1:8, y='a':'h', z=repeat([true,false], outer=4)) + +#- + +@with(df, :x+:z) # expressions with columns of DataFrame + +#- + +@with df begin # you can define code blocks + a = :x[:z] + b = :x[.!:z] + :y + [a; b] +end + +#- + +a # @with creates hard scope so variables do not leak out + +#- + +df2 = DataFrame(a = [:a, :b, :c]) +@with(df2, :a .== ^(:a)) # sometimes we want to work on raw Symbol, ^() escapes it + +#- + +df2 = DataFrame(x=1:3, y=4:6, z=7:9) +@with(df2, _I_(2:3)) # 
_I_(expression) is translated to df2[expression] + +#- + +@where(df, :x .< 4, :z .== true) # very useful macro for filtering + +#- + +@select(df, :x, y = 2*:x, z=:y) # create a new DataFrame based on the old one + +#- + +@transform(df, a=1, x = 2*:x, y=:x) # create a new DataFrame adding columns based on the old one + +#- + +@transform(df, a=1, b=:a) # old DataFrame is used and :a is not present there + +#- + +@orderby(df, :z, -:x) # sorting into a new data frame, less powerful than sort, but lightweight + +#- + +@linq df |> # chaining of operations on DataFrame + where(:x .< 5) |> + orderby(:z) |> + transform(x²=:x.^2) |> + select(:z, :x, :x²) + +#- + +f(df, col) = df[col] # you can define your own functions and put them in the chain +@linq df |> where(:x .<= 4) |> f(:x) + +# ### DataFramesMeta - working on grouped `DataFrame` + +df = DataFrame(a = 1:12, b = repeat('a':'d', outer=3)) +g = groupby(df, :b) + +#- + +@by(df, :b, first=first(:a), last=last(:a), mean=mean(:a)) # more convinient than by from DataFrames + +#- + +@based_on(g, first=first(:a), last=last(:a), mean=mean(:a)) # the same as by but on grouped DataFrame + +#- + +@where(g, mean(:a) > 6.5) # filter gropus on aggregate conditions + +#- + +@orderby(g, -sum(:a)) # order groups on aggregate conditions + +#- + +@transform(g, center = mean(:a), centered = :a - mean(:a)) # perform operations within a group and return ungroped DataFrame + +#- + +DataFrame(g) # a nice convinience function not defined in DataFrames + +#- + +@transform(g) # actually this is the same + +#- + +@linq df |> groupby(:b) |> where(mean(:a) > 6.5) |> DataFrame # you can do chaining on grouped DataFrames as well + +# ### DataFramesMeta - rowwise operations on `DataFrame` + +df = DataFrame(a = 1:12, b = repeat(1:4, outer=3)) + +#- + +## such conditions are often needed but are complex to write +@transform(df, x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no")) + +#- + +## one option is to use a function that works on a single 
observation and broadcast it +myfun(a, b) = a > 6 && b == 4 ? "yes" : "no" +@transform(df, x = myfun.(:a, :b)) + +#- + +## or you can use @byrow! macro that allows you to process DataFrame rowwise +@byrow! df begin + @newcol x::Vector{String} + :x = :a > 6 && :b == 4 ? "yes" : "no" +end + +# ### Visualizing data with StatPlots + +using StatPlots # you might need to setup Plots package and some plotting backend first + +#- + +## we present only a minimal functionality of the package + +#- + +srand(1) +df = DataFrame(x = sort(randn(1000)), y=randn(1000), z = [fill("b", 500); fill("a", 500)]) + +#- + +@df df plot(:x, :y, legend=:topleft, label="y(x)") # a most basic plot + +#- + +@df df density(:x, label="") # density plot + +#- + +@df df histogram(:y, label="y") # and a histogram + +#- + +@df df boxplot(:z, :x, label="x") + +#- + +@df df violin(:z, :y, label="y") + From 53433ec936fa5aeb13598ed608324bbf864064b3 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sat, 17 Apr 2021 21:43:10 -0500 Subject: [PATCH 02/24] traslation to spanish README --- literate_notebooks/src-ES/README.md | 147 ++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 literate_notebooks/src-ES/README.md diff --git a/literate_notebooks/src-ES/README.md b/literate_notebooks/src-ES/README.md new file mode 100644 index 0000000..4a98762 --- /dev/null +++ b/literate_notebooks/src-ES/README.md @@ -0,0 +1,147 @@ +# Una Introducción a DataFrames + +[Bogumił Kamiński](http://bogumilkaminski.pl/about/), November 2020, 2020 +(Traducción por Miguel Raz Guzmán Macedo Abril 2021) + +**Este tutorial es para DataFrames 0.22.0** + +Una breve introducción al uso de los [DataFrames](https://github.com/JuliaData/DataFrames.jl). + +Este tutorial contiene una especificación de la versión del proyecto bajo el cual +debería correr. 
Para preparar este ambiente, antes de usar los notebooks, hay +que correr la siguiente línea en el folder del proyecto: + +``` +julia -e 'using Pkg; Pkg.activate("."); Pkg.instantiate()' +``` + +Corrido en Julia 1.5.3. Las dependencias del proyecto son las siguientes: + +``` + [69666777] Arrow v1.0.1 + [6e4b80f9] BenchmarkTools v0.5.0 + [336ed68f] CSV v0.8.2 + [324d7699] CategoricalArrays v0.9.0 + [944b1d66] CodecZlib v0.7.0 + [a93c6f00] DataFrames v0.22.1 + [1313f7d8] DataFramesMeta v0.6.0 + [5789e2e9] FileIO v1.4.4 + [da1fdf0e] FreqTables v0.4.2 + [7073ff75] IJulia v1.23.0 + [babc3d20] JDF v0.2.20 + [9da8a3cd] JLSO v2.4.0 + [b9914132] JSONTables v1.0.0 + [86f7a689] NamedArrays v0.9.4 + [b98c9c47] Pipe v1.3.0 + [2dfb63ee] PooledArrays v0.5.3 + [f3b207a7] StatsPlots v0.14.17 + [bd369af6] Tables v1.2.1 + [a5390f91] ZipFile v0.9.3 + [9a3f8284] Random + [10745b16] Statistics +``` + +Intentaré mantener el material actualizado conforme evolucionen los paquetes. + +Este tutorial cubre +[DataFrames](https://github.com/JuliaData/DataFrames.jl) +y [CategoricalArrays](https://github.com/JuliaData/CategoricalArrays.jl), +pues constituyen la mayor parte de [DataFrames](https://github.com/JuliaData/DataFrames.jl) +en conjunto con otras paqueterías específicas para leer y escribir archivos. + +En los [extras](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/13_extras.ipynb) +se da mención *especial* de funcionalidades *selectas* sobre paqueterías que he encontrado ser +particularmente útiles para la manipulación de datos. Esos paquetes son: +[FreqTables](https://github.com/nalimilan/FreqTables.jl), +[DataFramesMeta](https://github.com/JuliaStats/DataFramesMeta.jl) (depende del DataFrames.jl 0.22 release), +[StatsPlots](https://github.com/JuliaPlots/StatsPlots.jl).
+ +# Inicializando el Jupyter Notebook para que funcione con DataFrames.jl + +Por default los notebooks de Jupyter limitarán las filas y columnas cuando +muestren un data frame para que quepa en la pantalla (similar al REPL). + + +Puedes cambiar esto fijando las variables `ENV["COLUMNS"]` o `ENV["LINES"]` +para que contengan las dimensiones máximas del output de caracteres al correr en notebook. +Alternativamente, se puede agregar `"COLUMNS": "1000", "LINES": "100"` a la variable `"env"` +en el archivo kernel de Jupyter. Ver [aquí](https://jupyter-client.readthedocs.io/en/stable/kernels.html) +para más información sobre la ubicación y especificación de los kernels de Jupyter. + +# Tabla de Contenidos + +| Archivo | Tema | +|-------------------------------------------------------------------------------------------------------------------|-----------------------------------| +| [01_constructors.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/01_constructors.ipynb) | Creating DataFrame and conversion | +| [02_basicinfo.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/02_basicinfo.ipynb) | Getting summary information | +| [03_missingvalues.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/03_missingvalues.ipynb) | Handling missing values | +| [04_loadsave.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/04_loadsave.ipynb) | Loading and saving DataFrames | +| [05_columns.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/05_columns.ipynb) | Working with columns of DataFrame | +| [06_rows.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/06_rows.ipynb) | Working with row of DataFrame | +| [07_factors.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/07_factors.ipynb) | Working with categorical data | +| [08_joins.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/08_joins.ipynb) | Joining
DataFrames | +| [09_reshaping.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/09_reshaping.ipynb) | Reshaping DataFrames | +| [10_transforms.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/10_transforms.ipynb) | Transforming DataFrames | +| [11_performance.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/11_performance.ipynb) | Performance tips | +| [12_pitfalls.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/12_pitfalls.ipynb) | Possible pitfalls | +| [13_extras.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/13_extras.ipynb) | Additional interesting packages | + +Changelog: + +| Date | Changes | +| ---------- | ------------------------------------------------------------ | +| 2017-12-05 | Initial release | +| 2017-12-06 | Added description of `insert!`, `merge!`, `empty!`, `categorical!`, `delete!`, `DataFrames.index` | +| 2017-12-09 | Added performance tips | +| 2017-12-10 | Added pitfalls | +| 2017-12-18 | Added additional worthwhile packages: *FreqTables* and *DataFramesMeta* | +| 2017-12-29 | Added description of `filter` and `filter!` | +| 2017-12-31 | Added description of conversion to `Matrix` | +| 2018-04-06 | Added example of extracting a row from a `DataFrame` | +| 2018-04-21 | Major update of whole tutorial | +| 2018-05-01 | Added `byrow!` example | +| 2018-05-13 | Added `StatPlots` package to extras | +| 2018-05-23 | Improved comments in sections 1 do 5 by [Jane Herriman](https://github.com/xorJane) | +| 2018-07-25 | Update to 0.11.7 release | +| 2018-08-25 | Update to Julia 1.0 release: sections 1 to 10 | +| 2018-08-29 | Update to Julia 1.0 release: sections 11, 12 and 13 | +| 2018-09-05 | Update to Julia 1.0 release: FreqTables section | +| 2018-09-10 | Added CSVFiles section to chapter on load/save | +| 2018-09-26 | Updated to DataFrames 0.14.0 | +| 2018-10-04 | Updated to DataFrames 0.14.1, added `haskey` and `repeat` | +| 
2018-12-08 | Updated to DataFrames 0.15.2 | +| 2019-01-03 | Updated to DataFrames 0.16.0, added serialization instructions | +| 2019-01-18 | Updated to DataFrames 0.17.0, added `passmissing` | +| 2019-01-27 | Added Feather.jl file read/write | +| 2019-01-30 | Renamed StatPlots.jl to StatsPlots.jl and added Tables.jl| +| 2019-02-08 | Added `groupvars` and `groupindices` functions| +| 2019-04-27 | Updated to DataFrames 0.18.0, dropped JLD2.jl | +| 2019-04-30 | Updated handling of missing values description | +| 2019-07-16 | Updated to DataFrames 0.19.0 | +| 2019-08-14 | Added JSONTables.jl and `Tables.columnindex` | +| 2019-08-16 | Added Project.toml and Manifest.toml | +| 2019-08-26 | Update to Julia 1.2 and DataFrames 0.19.3 | +| 2019-08-29 | Add example how to compress/decompress CSV file using CodecZlib | +| 2019-08-30 | Add examples of JLSO.jl and ZipFile.jl by [xiaodaigh](https://github.com/xiaodaigh) | +| 2019-11-03 | Add examples of JDF.jl by [xiaodaigh](https://github.com/xiaodaigh) | +| 2019-12-08 | Updated to DataFrames 0.20.0 | +| 2020-05-06 | Updated to DataFrames 0.21.0 (except load/save and extras) | +| 2020-11-20 | Updated to DataFrames 0.22.0 (except DataFramesMeta.jl which does not work yet) | +| 2020-11-26 | Updated to DataFramesMeta.jl 0.6; update by @pdeffebach | + +# Resumen de funciones clave: + +1. Constructors: `DataFrame`, `DataFrame!`, `Tables.rowtable`, `Tables.columntable`, `Matrix`, `eachcol`, `eachrow`, `Tables.namedtupleiterator`, `empty`, `empty!` +2. Descripciones: `size`, `nrow`, `ncol`, `describe`, `names`, `eltypes`, `first`, `last`, `getindex`, `setindex!`, `@view`, `isapprox` +3. Manejo de missing: `missing` (singleton instance of `Missing`), `ismissing`, `nonmissingtype`, `skipmissing`, `replace`, `replace!`, `coalesce`, `allowmissing`, `disallowmissing`, `allowmissing!`, `completecases`, `dropmissing`, `dropmissing!`, `disallowmissing`, `disallowmissing!`, `passmissing` +4. 
Cargando y guardando archivos: `CSV` (package), `CSVFiles` (package), `Serialization` (module), `CSV.read`, `CSV.write`, `save`, `load`, `serialize`, `deserialize`, `Arrow.write`, `Arrow.Table` (from Arrow.jl package), `JSONTables` (package), `arraytable`, `objecttable`, `jsontable`, `CodecZlib` (module), `GzipCompressorStream`, `GzipDecompressorStream`, `JDF.jl` (package), `JDF.savejdf`, `JDF.loadjdf`, `JLSO.jl` (package), `JLSO.save`, `JLSO.load`, `ZipFile.jl` (package), `ZipFile.reader`, `ZipFile.writer`, `ZipFile.addfile` +5. Trabajando con columnas: `rename`, `rename!`, `hcat`, `insertcols!`, `categorical!`, `columnindex`, `hasproperty`, `select`, `select!`, `transform`, `transform!`, `combine`, `Not`, `All`, `Between`, `ByRow`, `AsTable` +6. Trabajando con filas: `sort!`, `sort`, `issorted`, `append!`, `vcat`, `push!`, `view`, `filter`, `filter!`, `delete!`, `unique`, `nonunique`, `unique!`, `repeat`, `parent`, `parentindices`, `flatten`, `@pipe` (from `Pipe` package), `only` +7. Trabajando con datos categóricos: `categorical`, `cut`, `isordered`, `ordered!`, `levels`, `unique`, `levels!`, `droplevels!`, `get`, `recode`, `recode!` +8. Joins: `innerjoin`, `leftjoin`, `rightjoin`, `outerjoin`, `semijoin`, `antijoin`, `crossjoin` +9. Reorganizando: `stack`, `unstack` +10. Transformadas: `groupby`, `mapcols`, `parent`, `groupcols`, `valuecols`, `groupindices`, `keys` (for `GroupedDataFrame`), `combine`, `select`, `select!`, `transform`, `transform!`, `@pipe` (from `Pipe` package) +11. 
Extras: + * [FreqTables](https://github.com/nalimilan/FreqTables.jl): `freqtable`, `prop`, `Name` + * [DataFramesMeta](https://github.com/JuliaStats/DataFramesMeta.jl): `@with`, `@where`, `@select`, `@transform`, `@orderby`, `@linq`, `@by`, `@combine`, `@eachrow`, `@newcol`, `^`, `cols` + * [StatsPlots](https://github.com/JuliaPlots/StatsPlots.jl): `@df`, `plot`, `density`, `histogram`,`boxplot`, `violin` From 88f18ed136a0beaae7fc61c0df1c49dfc6c57824 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sat, 17 Apr 2021 22:08:02 -0500 Subject: [PATCH 03/24] translation to spanish 02 --- literate_notebooks/src-ES/01_constructors.jl | 76 ++++++++++---------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/literate_notebooks/src-ES/01_constructors.jl b/literate_notebooks/src-ES/01_constructors.jl index 333a81e..62ffa39 100644 --- a/literate_notebooks/src-ES/01_constructors.jl +++ b/literate_notebooks/src-ES/01_constructors.jl @@ -1,104 +1,105 @@ -# # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** +# (Traducción por Miguel Raz, Abril 16 2021) # -# Let's get started by loading the `DataFrames` package. +# Empecemos cargando el paquete de `DataFrames`. using DataFrames -# ## Constructors and conversion +# ## Constructores y conversiones #- -# ### Constructors +# ### Constructores # -# In this section, you'll see many ways to create a `DataFrame` using the `DataFrame()` constructor. +# En esta sección, verás distintas maneras de crear un `DataFrame` usando el constructor `DataFrame()`. # -# First, we could create an empty DataFrame, +# Primero, creemos un DataFrame vacío. -DataFrame() # empty DataFrame +DataFrame() # DataFrame vacío -# Or we could call the constructor using keyword arguments to add columns to the `DataFrame`. +# O podemos llamar al constructor usando keyword arguments para agregar columnas al `DataFrame`.
DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3])) -# We can create a `DataFrame` from a dictionary, in which case keys from the dictionary will be sorted to create the `DataFrame` columns. +# Podemos crear el `DataFrame` de un diccionario, en cuyo caso las llaves del diccionario estarán ordenadas para crear las columnas del `DataFrame`. x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b']) DataFrame(x) -# Rather than explicitly creating a dictionary first, as above, we could pass `DataFrame` arguments with the syntax of dictionary key-value pairs. +# En vez de explícitamente crear el diccionario primero, como hicimos arriba, podríamos pasar argumentos de `DataFrame` con la sintaxis de pares llave-valor de un diccionario. # -# Note that in this case, we use symbols to denote the column names and arguments are not sorted. For example, `:A`, the symbol, produces `A`, the name of the first column here: +# Notar que en este caso, usamos símbolos para denotar los nombres de las columnas y los argumentos no están ordenados. Por ejemplo, `:A`, el símbolo, produce `A`, el nombre de la primera columna: DataFrame(:A => [1,2], :B => [true, false], :C => ['a', 'b']) -# Here we create a `DataFrame` from a vector of vectors, and each vector becomes a column. +# Aquí creamos un `DataFrame` de un vector de vectores, donde cada vector se convierte en una columna. DataFrame([rand(3) for i in 1:3]) -# For now we can construct a single `DataFrame` from a `Vector` of atoms, creating a `DataFrame` with a single row. In future releases of DataFrames.jl, this will throw an error. +# Por ahora podemos construir un `DataFrame` de un `Vector` de átomos, creando un `DataFrame` con una sola hilera. En versiones futuras de DataFrames.jl, esto arrojará un error. DataFrame(rand(3)) -# Instead use a transposed vector if you have a vector of atoms (in this way you effectively pass a two dimensional array to the constructor which is supported).
+# Si tienes un vector de átomos, es mejor usar un vector transpuesto (pues efectivamente uno pasa un arreglo bidimensional, lo cual sí tiene soporte.) DataFrame(transpose([1, 2, 3])) -# Pass a second argument to give the columns names. +# Pasa un segundo argumento para darle nombre a las columnas. DataFrame([1:3, 4:6, 7:9], [:A, :B, :C]) -# Here we create a `DataFrame` from a matrix, +# Aquí creamos un `DataFrame` de una matriz, DataFrame(rand(3,4)) -# and here we do the same but also pass column names. +# y aquí hacemos lo mismo pero también pasamos los nombres de las columnas. DataFrame(rand(3,4), Symbol.('a':'d')) -# We can also construct an uninitialized DataFrame. +# También podemos construir un DataFrame no inicializado. # -# Here we pass column types, names and number of rows; we get `missing` in column :C because `Any >: Missing`. +# Aquí pasamos los tipos de las columnas, nombres y número de hileras; obtenemos un `missing` en la columna :C porque `Any >: Missing`: DataFrame([Int, Float64, Any], [:A, :B, :C], 1) -# Here we create a `DataFrame`, but column `:C` is #undef and Jupyter has problem with displaying it. (This works OK at the REPL.) +# Aquí nosotros creamos un `DataFrame`, pero la columna `:C` es `#undef` y Jupyter tiene problemas para mostrarlo. (Esto funciona en el REPL sin problemas.) # -# This will be fixed in next release of DataFrames! DataFrame([Int, Float64, String], [:A, :B, :C], 1) -# To initialize a `DataFrame` with column names, but no rows use +# Para inicializar un `DataFrame` con nombres de columnas, pero sin filas, usamos DataFrame([Int, Float64, String], [:A, :B, :C], 0) -# This syntax gives us a quick way to create homogenous `DataFrame`. +# Esta sintaxis nos da una manera rápida de crear un `DataFrame` homogéneo. DataFrame(Int, 3, 5) -# This example is similar, but has nonhomogenous columns. +# El ejemplo es similar, pero tiene columnas no-homogéneas.
DataFrame([Int, Float64], 4) -# Finally, we can create a `DataFrame` by copying an existing `DataFrame`. +# Finalmente, podemos crear un `DataFrame` copiando uno existente. # -# Note that `copy` creates a shallow copy. +# Notar que `copy` crea una copia superficial. y = DataFrame(x) z = copy(x) (x === y), (x === z), isequal(x, z) -# ### Conversion to a matrix +# ### Conversión a matrices # # Let's start by creating a `DataFrame` with two rows and two columns. +# Empecemos creando un `DataFrame` con dos filas y dos columnas. x = DataFrame(x=1:2, y=["A", "B"]) -# We can create a matrix by passing this `DataFrame` to `Matrix`. +# Podemos crear una matriz pasando este `DataFrame` a `Matrix`. Matrix(x) -# This would work even if the `DataFrame` had some `missing`s: +# Esto funciona aun si el `DataFrame` tiene `missing`s: x = DataFrame(x=1:2, y=[missing,"B"]) @@ -106,7 +107,7 @@ x = DataFrame(x=1:2, y=[missing,"B"]) Matrix(x) -# In the two previous matrix examples, Julia created matrices with elements of type `Any`. We can see more clearly that the type of matrix is inferred when we pass, for example, a `DataFrame` of integers to `Matrix`, creating a 2D `Array` of `Int64`s: +# En los dos ejemplos de matrices pasados, Julia creó matrices con elementos de tipo `Any`. Podemos ver más claramente qué tipo de matriz es inferido cuando pasamos, por ejemplo, un `DataFrame` de enteros a `Matrix`, creando un arreglo 2D de `Int64`s: x = DataFrame(x=1:2, y=3:4) @@ -114,7 +115,8 @@ x = DataFrame(x=1:2, y=3:4) Matrix(x) -# In this next example, Julia correctly identifies that `Union` is needed to express the type of the resulting `Matrix` (which contains `missing`s). +# En el próximo ejemplo, Julia correctamente identifica que el tipo `Union` se necesita para expresar el tipo resultante de `Matrix` (el cual contiene `missing`s).
+ x = DataFrame(x=1:2, y=[missing,4]) @@ -122,22 +124,22 @@ x = DataFrame(x=1:2, y=[missing,4]) Matrix(x) -# Note that we can't force a conversion of `missing` values to `Int`s! +# ¡Notemos que no podemos convertir forzosamente valores `missing` a `Int`s! Matrix{Int}(x) -# ### Handling of duplicate column names +# ### Lidiando con nombres de columnas repetidos # -# We can pass the `makeunique` keyword argument to allow passing duplicate names (they get deduplicated) +# Podemos pasar el keyword argument `makeunique` para permitir usar nombres duplicados (se desduplican) df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true) -# Otherwise, duplicates will not be allowed in the future. +# Si no es así, los duplicados no se permitirán en el futuro. df = DataFrame(:a=>1, :a=>2, :a_1=>3) -# A constructor that is passed column names as keyword arguments is a corner case. -# You cannot pass `makeunique` to allow duplicates here. +# Una excepción es un constructor al que se le pasan los nombres de columnas como keyword arguments. +# No puedes pasar `makeunique` para permitir duplicados en este caso.
df = DataFrame(a=1, a=2, makeunique=true) From cf1fcf7b2df0ab425dea64de4acca92fe880c7c7 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sat, 17 Apr 2021 22:20:35 -0500 Subject: [PATCH 04/24] translation to spanish 02_basicinfo.jl --- literate_notebooks/src-ES/02_basicinfo.jl | 41 ++++++++++++----------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/literate_notebooks/src-ES/02_basicinfo.jl b/literate_notebooks/src-ES/02_basicinfo.jl index 6cde7c6..3801219 100644 --- a/literate_notebooks/src-ES/02_basicinfo.jl +++ b/literate_notebooks/src-ES/02_basicinfo.jl @@ -1,75 +1,78 @@ # # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** +# Traducción por Miguel Raz, Abril 17, 2021 -using DataFrames # load package +using DataFrames # cargar el paquete -# ## Getting basic information about a data frame +# ## Obteniendo información básica de un DataFrame +# # -# Let's start by creating a `DataFrame` object, `x`, so that we can learn how to get information on that data frame. +# Empecemos creando un objeto `DataFrame`, llamado `x`, para que podamos aprender cómo sacarle información. x = DataFrame(A = [1, 2], B = [1.0, missing], C = ["a", "b"]) -# The standard `size` function works to get dimensions of the `DataFrame`, +# La función estándar `size` nos dice las dimensiones del `DataFrame`, size(x), size(x, 1), size(x, 2) -# as well as `nrow` and `ncol` from R; `length` gives number of columns. +# al igual que `nrow` y `ncol` de R; `length` nos da el número de columnas. nrow(x), ncol(x), length(x) -# `describe` gives basic summary statistics of data in your `DataFrame`. +# `describe` nos da estadísticas descriptivas básicas de nuestro `DataFrame`. describe(x) -# Use `showcols` to get informaton about columns stored in a DataFrame. +# Usa `showcols` para obtener información sobre columnas guardadas en un DataFrame.
showcols(x) -# `names` will return the names of all columns, +# `names` regresa el nombre de todas las columnas, names(x) -# and `eltypes` returns their types. +# y `eltypes` el de sus tipos. eltypes(x) -# Here we create some large DataFrame +# Aquí creamos un DataFrame más grande y = DataFrame(rand(1:10, 1000, 10)); -# and then we can use `head` to peek into its top rows +# y usamos `head` para asomarnos a sus primeras filas head(y) -# and `tail` to see its bottom rows. +# y `tail` para sus últimas filas. tail(y, 3) -# ### Most elementary get and set operations +# ### Operaciones elementales para asignar y sacar # -# Given the `DataFrame`, `x`, here are three ways to grab one of its columns as a `Vector`: +# Dado un objeto DataFrame llamado `x`, aquí hay 3 maneras de tomar una de sus columnas como un `Vector`: x[1], x[:A], x[:, 1] -# To grab one row as a DataFrame, we can index as follows. +# Para tomar una hilera de un DataFrame, lo indexamos como sigue x[1, :] -# We can grab a single cell or element with the same syntax to grab an element of an array. +# Podemos agarrar una sola celda o elemento con la misma sintaxis que usamos para elementos de arreglos. x[1, 1] -# Assignment can be done in ranges to a scalar, +# Asignar también se puede hacer con rangos a un escalar, x[1:2, 1:2] = 1 x -# to a vector of length equal to the number of assigned rows, +# a un vector de longitud igual al número de filas asignadas, x[1:2, 1:2] = [1,2] x -# or to another data frame of matching size. +# o a otro DataFrame de tamaño igual. 
x[1:2, 1:2] = DataFrame([5 6; 7 8]) x From 9b82f62caa1251e0ae1e69c399394d3d064d9dff Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sat, 17 Apr 2021 22:43:39 -0500 Subject: [PATCH 05/24] translation to spanish 03_missingvalues.jl --- literate_notebooks/src-ES/03_missingvalues.jl | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/literate_notebooks/src-ES/03_missingvalues.jl b/literate_notebooks/src-ES/03_missingvalues.jl index 1e17d97..3ef9776 100644 --- a/literate_notebooks/src-ES/03_missingvalues.jl +++ b/literate_notebooks/src-ES/03_missingvalues.jl @@ -1,41 +1,44 @@ # # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** +# (Traducción de Miguel Raz Guzmán Macedo) -using DataFrames # load package +using DataFrames # cargar paquete -# ## Handling missing values +# ## Manejo de valores faltantes # -# A singelton type `Missings.Missing` allows us to deal with missing values. +# Un tipo singleton `Missings.Missing` permite lidiar con valores faltantes. missing, typeof(missing) -# Arrays automatically create an appropriate union type. +# Los arreglos automáticamente crean una unión de tipos apropiada. x = [1, 2, missing, 3] -# `ismissing` checks if passed value is missing. +# `ismissing` checa si el valor que se le pasa es faltante. ismissing(1), ismissing(missing), ismissing(x), ismissing.(x) -# We can extract the type combined with Missing from a `Union` via +# Podemos extraer el tipo combinado con Missing de una `Union` vía # -# (This is useful for arrays!) +# (¡Esto es muy útil para los arreglos!) eltype(x), Missings.T(eltype(x)) -# `missing` comparisons produce `missing`. +# Las comparaciones de `missing` producen `missing`. missing == missing, missing != missing, missing < missing -# This is also true when `missing`s are compared with values of other types. +# Eso también es cierto cuando `missing`s se comparan con valores de otros tipos.
1 == missing, 1 != missing, 1 < missing -# `isequal`, `isless`, and `===` produce results of type `Bool`. +# `isequal`, `isless`, y `===` producen resultados de tipo `Bool`. +# isequal(missing, missing), missing === missing, isequal(1, missing), isless(1, missing) -# In the next few examples, we see that many (not all) functions handle `missing`. +# En los próximos ejemplos, vemos que muchas (no todas) funciones manejan `missing`. map(x -> x(missing), [sin, cos, zero, sqrt]) # part 1 @@ -47,65 +50,65 @@ map(x -> x(missing, 1), [+, - , *, /, div]) # part 2 map(x -> x([1,2,missing]), [minimum, maximum, extrema, mean, any, float]) # part 3 -# `skipmissing` returns iterator skipping missing values. We can use `collect` and `skipmissing` to create an array that excludes these missing values. +# `skipmissing` regresa un iterador que salta valores faltantes. Podemos usar `collect` y `skipmissing` para crear un arreglo que excluye estos valores faltantes. collect(skipmissing([1, missing, 2, missing])) -# Similarly, here we combine `collect` and `Missings.replace` to create an array that replaces all missing values with some value (`NaN` in this case). +# Similarmente, aquí combinamos `collect` y `Missings.replace` para crear un arreglo que reemplaza todos los valores faltantes con algún valor (`NaN`, en este caso). collect(Missings.replace([1.0, missing, 2.0, missing], NaN)) -# Another way to do this: +# Otra manera de hacer esto es: coalesce.([1.0, missing, 2.0, missing], NaN) -# Caution: `nothing` would also be replaced here (for Julia 0.7 a more sophisticated behavior of `coalesce` that allows to avoid this problem is planned). +# Cuidado: `nothing` también sería reemplazado aquí (Para Julia 0.7 un comportamiento más sofisticado de `coalesce` permite que esquivemos este problema.) coalesce.([1.0, missing, nothing, missing], NaN) -# You can use `recode` if you have homogenous output types. +# Puedes usar `recode` si tienes tipos homogéneos en el output. 
recode([1.0, missing, 2.0, missing], missing=>NaN) -# You can use `unique` or `levels` to get unique values with or without missings, respectively. +# Puedes usar `unique` o `levels` para obtener valores únicos con o sin missings, respectivamente. unique([1, missing, 2, missing]), levels([1, missing, 2, missing]) -# In this next example, we convert `x` to `y` with `allowmissing`, where `y` has a type that accepts missings. +# En este ejemplo, convertimos `x` a `y` con `allowmissing`, donde `y` tiene un tipo que acepta missings. x = [1,2,3] y = allowmissing(x) -# Then, we convert back with `disallowmissing`. This would fail if `y` contained missing values! +# Después, lo convertimos de regreso con `disallowmissing`. ¡Esto falla si `y` contiene valores faltantes! z = disallowmissing(y) x,y,z -# In this next example, we show that the type of each column in `x` is initially `Int64`. After using `allowmissing!` to accept missing values in columns 1 and 3, the types of those columns become `Union`s of `Int64` and `Missings.Missing`. +# En el próximo ejemplo, mostramos que el tipo de cada columna de `x` es inicialmente `Int64`. Después de usar `allowmissing!`, aceptamos valores faltantes en las columnas 1 y 3. Los tipos de esas columnas se convierten en `Union`es de `Int64` y `Missings.Missing`. x = DataFrame(Int, 2, 3) println("Before: ", eltypes(x)) -allowmissing!(x, 1) # make first column accept missings -allowmissing!(x, :x3) # make :x3 column accept missings +allowmissing!(x, 1) # Hacer que la primera columna permita valores faltantes +allowmissing!(x, :x3) # Hacer que la columna :x3 acepte valores faltantes println("After: ", eltypes(x)) -# In this next example, we'll use `completecases` to find all the rows of a `DataFrame` that have complete data. +# En este ejemplo, usaremos `completecases` para encontrar todas las hileras de un `DataFrame` que tengan datos completos. 
x = DataFrame(A=[1, missing, 3, 4], B=["A", "B", missing, "C"]) println(x) println("Complete cases:\n", completecases(x)) -# We can use `dropmissing` or `dropmissing!` to remove the rows with incomplete data from a `DataFrame` and either create a new `DataFrame` or mutate the original in-place. +# Podemos usar `dropmissing` o `dropmissing!` para quitar las filas con datos incompletos de un `DataFrame` y crear un nuevo `DataFrame` o mutar el original en su lugar. y = dropmissing(x) dropmissing!(x) [x, y] -# When we call `showcols` on a `DataFrame` with dropped missing values, the columns still allow missing values. +# Cuando llamamos `showcols` en un `DataFrame` al que ya le quitamos los valores faltantes, las columnas siguen permitiendo valores faltantes. showcols(x) -# Since we've excluded missing values, we can safely use `disallowmissing!` so that the columns will no longer accept missing values. +# Como ya excluimos los valores faltantes, podemos usar `disallowmissing!` con seguridad para que las columnas ya no puedan aceptar valores faltantes. disallowmissing!(x) showcols(x) From 51662a6ed807dc415c2115913e7ae000723dab04 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sat, 17 Apr 2021 22:57:03 -0500 Subject: [PATCH 06/24] translation to spanish 04_loadsave.jl --- literate_notebooks/src-ES/04_loadsave.jl | 36 +++++++++++++----------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/literate_notebooks/src-ES/04_loadsave.jl b/literate_notebooks/src-ES/04_loadsave.jl index d166830..9607057 100644 --- a/literate_notebooks/src-ES/04_loadsave.jl +++ b/literate_notebooks/src-ES/04_loadsave.jl @@ -1,64 +1,66 @@ -# # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** +# (Traducción de Miguel Raz Guzmán Macedo) -using DataFrames # load package +using DataFrames # cargar paquete -# ## Load and save DataFrames -# We do not cover all features of the packages.
Please refer to their documentation to learn them. +# ## Cargar y guardar DataFrames +# No cubriremos toda la funcionalidad de los paquetes. Por favor leer la documentación para saber más. # -# Here we'll load `CSV` to read and write CSV files and `JLD`, which allows us to work with a Julia native binary format. +# Aquí cargaremos `CSV` para leer y escribir archivos CSV. `JLD` nos permite trabajar con un formato binario nativo de Julia. using CSV using JLD -# Let's create a simple `DataFrame` for testing purposes, +# Creemos un `DataFrame` para ver casos sencillos, x = DataFrame(A=[true, false, true], B=[1, 2, missing], C=[missing, "b", "c"], D=['a', missing, 'c']) -# and use `eltypes` to look at the columnwise types. +# y usaremos `eltypes` para ver los tipos columnares. eltypes(x) -# Let's use `CSV` to save `x` to disk; make sure `x.csv` does not conflict with some file in your working directory. +# Usemos `CSV` para guardar `x` a disco; asegúrese de que `x.csv` no genere conflictos con archivos en su directorio actual. CSV.write("x.csv", x) # Now we can see how it was saved by reading `x.csv`. +# Ahora podemos ver cómo se guardó leyendo `x.csv`. print(read("x.csv", String)) -# We can also load it back. `use_mmap=false` disables memory mapping so that on Windows the file can be deleted in the same session. +# También lo podemos cargar de regreso. `use_mmap=false` deshabilita el uso de `memory mapping` para que los archivos se puedan borrar en la misma sesión en Windows. y = CSV.read("x.csv", use_mmap=false) -# When loading in a `DataFrame` from a `CSV`, all columns allow `Missing` by default. Note that the column types have changed! +# Cuando cargas un `DataFrame` de un `CSV`, todas las columnas permiten `Missing` por default. ¡Nota que el tipo de las columnas cambió! eltypes(y) -# Now let's save `x` to a file in a binary format; make sure that `x.jld` does not exist in your working directory. +# Ahora guardemos `x` a un archivo en formato binario.
Asegúrese que `x.jld` no existe en su directorio actual. save("x.jld", "x", x) -# After loading in `x.jld` as `y`, `y` is identical to `x`. +# Después de cargar `x.jld` como `y`, `y` es idéntico a `x`. y = load("x.jld", "x") -# Note that the column types of `y` are the same as those of `x`! +# Observación: ¡los tipos de columnas de `y` son los mismos que los de `x`! eltypes(y) -# Next, we'll create the files `bigdf.csv` and `bigdf.jld`, so be careful that you don't already have these files on disc! +# Ahora, crearemos los archivos `bigdf.csv` y `bigdf.jld`, entonces, ¡cuidado con que no existan esos archivos actualmente en su disco! # -# In particular, we'll time how long it takes us to write a `DataFrame` with 10^3 rows and 10^5 columns to `.csv` and `.jld` files. *You can expect JLD to be faster!* Use `compress=true` to reduce file sizes. +# En particular, mediremos el tiempo que toma escribir un `DataFrame` con 10^3 filas y 10^2 columnas a archivos `.csv` y `.jld`. *Puedes esperar que JLD sea más rápido*. Usa `compress=true` para reducir el tamaño de los archivos. bigdf = DataFrame(Bool, 10^3, 10^2) @time CSV.write("bigdf.csv", bigdf) @time save("bigdf.jld", "bigdf", bigdf) getfield.(stat.(["bigdf.csv", "bigdf.jld"]), :size) -# Finally, let's clean up. Do not run the next cell unless you are sure that it will not erase your important files. +# Finalmente, hay que hacer algo de limpieza. No corras la siguiente línea a menos que estés segura que no va a borrar archivos importantes.
-foreach(rm, ["x.csv", "x.jld", "bigdf.csv", "bigdf.jld"]) +#foreach(rm, ["x.csv", "x.jld", "bigdf.csv", "bigdf.jld"]) From 80375c027ce120cdefe6dc991b5366a8c55fb0ff Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sat, 17 Apr 2021 23:19:20 -0500 Subject: [PATCH 07/24] translation to spanish 05_columns.jl --- literate_notebooks/src-ES/05_columns.jl | 95 +++++++++++++------------ 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/literate_notebooks/src-ES/05_columns.jl b/literate_notebooks/src-ES/05_columns.jl index f32e02a..377041d 100644 --- a/literate_notebooks/src-ES/05_columns.jl +++ b/literate_notebooks/src-ES/05_columns.jl @@ -1,187 +1,188 @@ -# # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** +# (Traducción por Miguel Raz Guzmán Macedo, Abril 17, 2021) -using DataFrames # load package +using DataFrames -# ## Manipulating columns of DataFrame +# ## Manipulando columnas de DataFrames #- -# ### Renaming columns +# ### Renombrando columnas # -# Let's start with a `DataFrame` of `Bool`s that has default column names. +# Empecemos con un `DataFrame` de `Bool`s y nombres de columnas por default. x = DataFrame(Bool, 3, 4) -# With `rename`, we create new `DataFrame`; here we rename the column `:x1` to `:A`. (`rename` also accepts collections of Pairs.) +# Con `rename`, creamos un nuevo `DataFrame`; aquí renombramos la columna `:x1` a `:A`. (`rename` también acepta colecciones de Pairs.) rename(x, :x1 => :A) -# With `rename!` we do an in place transformation. +# Con `rename!` hacemos una transformación in situ. # -# This time we've applied a function to every column name. +# Esta vez hemos aplicado una función a cada nombre de columna. rename!(c -> Symbol(string(c)^2), x) -# We can also change the name of a particular column without knowing the original. +# Podemos también cambiar el nombre de una columna en particular sin conocer el nombre original.
# -# Here we change the name of the third column, creating a new `DataFrame`. +# Aquí cambiamos el nombre de la tercera columna, creando un nuevo `DataFrame`. rename(x, names(x)[3] => :third) -# With `names!`, we can change the names of all variables. +# Con `names!`, podemos cambiar el nombre de todas las variables. names!(x, [:a, :b, :c, :d]) -# We get an error when we try to provide duplicate names +# Obtenemos un error si proveemos nombres duplicados names!(x, fill(:a, 4)) -# unless we pass `makeunique=true`, which allows us to handle duplicates in passed names. +# A no ser que pasemos el argumento `makeunique=true`, lo cual nos permite pasar nombres duplicados. names!(x, fill(:a, 4), makeunique=true) -# ### Reordering columns +# ### Reorganizar columnas #- -# We can reorder the names(x) vector as needed, creating a new DataFrame. +# Podemos reorganizar el vector `names(x)` como sea necesario, creando un nuevo DataFrame en el proceso. srand(1234) x[shuffle(names(x))] -# also `permutecols!` will be introduced in next release of DataFrames +# `permutecols!` estará disponible en la siguiente versión de DataFrames. #- -# ### Merging/adding columns +# ### Juntando/agregando columnas x = DataFrame([(i,j) for i in 1:3, j in 1:4]) -# With `hcat` we can merge two `DataFrame`s. Also [x y] syntax is supported but only when DataFrames have unique column names. +# Con `hcat` (*con*catenación *h*orizontal), podemos juntar 2 `DataFrame`s. `[x y]` también es sintaxis válida pero sólo cuando los DataFrames tienen nombres de columnas únicos. hcat(x, x, makeunique=true) -# We can also use `hcat` to add a new column; a default name `:x1` will be used for this column, so `makeunique=true` is needed. +# Podemos igual usar `hcat` para agregar una nueva columna - por default se usará el nombre `:x1` para esta columna, entonces se requiere el uso de `makeunique=true`. y = hcat(x, [1,2,3], makeunique=true) -# You can also prepend a vector with `hcat`.
+# También puedes anteponer un vector con `hcat`. hcat([1,2,3], x, makeunique=true) -# Alternatively you could append a vector with the following syntax. This is a bit more verbose but cleaner. +# Alternativamente se puede anexar un vector con la siguiente sintaxis. Es un poco más verbosa pero es más limpia. y = [x DataFrame(A=[1,2,3])] -# Here we do the same but add column `:A` to the front. +# Hacemos lo mismo pero agregamos la columna `:A` al frente. y = [DataFrame(A=[1,2,3]) x] -# A column can also be added in the middle. Here a brute-force method is used and a new DataFrame is created. +# Una columna también se puede agregar en medio. Aquí se usa un método de fuerza bruta y se crea un nuevo DataFrame. using BenchmarkTools @btime [$x[1:2] DataFrame(A=[1,2,3]) $x[3:4]] -# We could also do this with a specialized in place method `insert!`. Let's add `:newcol` to the `DataFrame` `y`. +# También podemos usar el método especializado in situ `insert!`. Agreguemos `:newcol` al `DataFrame` `y`. insert!(y, 2, [1,2,3], :newcol) -# If you want to insert the same column name several times `makeunique=true` is needed as usual. +# Si quieres insertar el mismo nombre de columna varias veces, se requiere `makeunique=true` como de costumbre. insert!(y, 2, [1,2,3], :newcol, makeunique=true) -# We can see how much faster it is to insert a column with `insert!` than with `hcat` using `@btime`. +# Podemos ver qué tanto más rápido es insertar una columna con `insert!` que con `hcat` si usamos `@btime`. @btime insert!(copy($x), 3, [1,2,3], :A) -# Let's use `insert!` to append a column in place, +# Usemos `insert!` para anexar una columna in situ, insert!(x, ncol(x)+1, [1,2,3], :A) -# and to in place prepend a column. +# y para anteponer una columna in situ igual. insert!(x, 1, [1,2,3], :B) -# With `merge!`, let's merge the second DataFrame into first, but overwriting duplicates. +# Con `merge!`, juntemos el segundo DataFrame al primero, pero sobreescribiendo duplicados.
df1 = DataFrame(x=1:3, y=4:6) df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) df1, df2, merge!(df1, df2) -# For comparison: merge two `DataFrames`s but renaming duplicate names via `hcat`. +# Para comparar: une 2 `DataFrame`s pero renombrando los duplicados vía `hcat`. df1 = DataFrame(x=1:3, y=4:6) df2 = DataFrame(x='a':'c', z = 'd':'f', new=11:13) hcat(df1, df2, makeunique=true) -# ### Subsetting/removing columns +# ### Seleccionando subconjuntos / quitando columnas # -# Let's create a new `DataFrame` `x` and show a few ways to create DataFrames with a subset of `x`'s columns. +# Creemos un nuevo `DataFrame` `x` y mostremos algunas maneras de crear DataFrames con un subconjunto de columnas de `x`. x = DataFrame([(i,j) for i in 1:3, j in 1:5]) -# First we could do this by index +# Primero, podemos hacer esto usando el índice x[[1,2,4,5]] -# or by column name. +# o el nombre de la columna. x[[:x1, :x4]] -# We can also choose to keep or exclude columns by `Bool`. (We need a vector whose length is the number of columns in the original `DataFrame`.) +# También podemos escoger conservar o excluir columnas con un vector de `Bool`. (Necesitamos un vector cuya longitud sea el número de columnas del `DataFrame` original.) x[[true, false, true, false, true]] -# Here we create a single column `DataFrame`, +# Aquí creamos un `DataFrame` de una sola columna, x[[:x1]] -# and here we access the vector contained in column `:x1`. +# Y aquí accesamos el vector contenido en la columna `:x1`. x[:x1] -# We could grab the same vector by column number +# Podemos agarrar el mismo vector por el número de la columna x[1] -# and remove everything from a `DataFrame` with `empty!`. +# y borrar todo dentro del `DataFrame` con `empty!`. empty!(y) -# Here we create a copy of `x` and delete the 3rd column from the copy with `delete!`. +# Ahora creamos una copia de `x` y borramos la 3ra columna de la copia con `delete!`.
z = copy(x) x, delete!(z, 3) -# ### Modify column by name +# ### Modificar columnas por nombre x = DataFrame([(i,j) for i in 1:3, j in 1:5]) -# With the following syntax, the existing column is modified without performing any copying. +# Con la siguiente sintaxis, la columna preexistente se modifica sin copias. x[:x1] = x[:x2] x -# We can also use the following syntax to add a new column at the end of a `DataFrame`. +# Podemos usar la siguiente sintaxis para agregar una nueva columna al final del `DataFrame`. x[:A] = [1,2,3] x -# A new column name will be added to our `DataFrame` with the following syntax as well (7 is equal to `ncol(x)+1`). +# Una nueva columna se agregará a nuestro `DataFrame` con la siguiente sintaxis también (`7 == ncol(x) + 1`). x[7] = 11:13 x -# ### Find column name +# ### Encontrar el nombre de columnas x = DataFrame([(i,j) for i in 1:3, j in 1:5]) -# We can check if a column with a given name exists via +# Podemos revisar si una columna con un nombre dado existe via :x1 in names(x) -# and determine its index via +# y determinar su índice via findfirst(names(x), :x2) From 9769314d40926c964e52a43755f171ec3ee0fb51 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 09:12:20 -0500 Subject: [PATCH 08/24] spanish translation of 06_rows.jl --- literate_notebooks/src-ES/06_rows.jl | 83 ++++++++++++++-------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/literate_notebooks/src-ES/06_rows.jl b/literate_notebooks/src-ES/06_rows.jl index 3660e40..b67d333 100644 --- a/literate_notebooks/src-ES/06_rows.jl +++ b/literate_notebooks/src-ES/06_rows.jl @@ -1,121 +1,122 @@ -# # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** +# (Traducción por Miguel Raz, Abril 18, 2021) -using DataFrames # load package +using DataFrames # cargar paquete srand(1); -# ## Manipulating rows of DataFrame +# ## Manipulando filas de DataFrames #- -# ### Reordering 
rows +# ### Reordenando filas -x = DataFrame(id=1:10, x = rand(10), y = [zeros(5); ones(5)]) # and we hope that x[:x] is not sorted :) +x = DataFrame(id=1:10, x = rand(10), y = [zeros(5); ones(5)]) # y esperamos que x[:x] no esté ordenado #- -issorted(x), issorted(x, :x) # check if a DataFrame or a subset of its columns is sorted +issorted(x), issorted(x, :x) # checamos si el DataFrame o un subconjunto de sus columnas están ordenadas. #- -sort!(x, :x) # sort x in place +sort!(x, :x) # ordenar in situ #- -y = sort(x, :id) # new DataFrame +y = sort(x, :id) # nuevo DataFrame #- -sort(x, (:y, :x), rev=(true, false)) # sort by two columns, first is decreasing, second is increasing +sort(x, (:y, :x), rev=(true, false)) # ordenar por dos columnas, la primera en orden descendente y la segunda en ascendente #- -sort(x, (order(:y, rev=true), :x)) # the same as above +sort(x, (order(:y, rev=true), :x)) # igual que arriba #- -sort(x, (order(:y, rev=true), order(:x, by=v->-v))) # some more fancy sorting stuff +sort(x, (order(:y, rev=true), order(:x, by=v->-v))) # más cosas muy sofisticadas para ordenar #- -x[shuffle(1:10), :] # reorder rows (here randomly) +x[shuffle(1:10), :] # reorganizar filas (aquí aleatoriamente) #- sort!(x, :id) -x[[1,10],:] = x[[10,1],:] # swap rows +x[[1,10],:] = x[[10,1],:] # intercambiar filas x #- -x[1,:], x[10,:] = x[10,:], x[1,:] # and swap again +x[1,:], x[10,:] = x[10,:], x[1,:] # y otra vez x -# ### Merging/adding rows +# ### Unir/añadir filas x = DataFrame(rand(3, 5)) #- -[x; x] # merge by rows - data frames must have the same column names; the same is vcat +[x; x] # unir por filas - los DataFrames deben tener los mismos nombres de columnas. Esto equivale a usar `vcat`.
#- -y = x[reverse(names(x))] # get y with other order of names +y = x[reverse(names(x))] # asignar `y` con otro orden para los nombres #- -vcat(x, y) # we get what we want as vcat does column name matching +vcat(x, y) # obtenemos lo que queremos, pues `vcat` empareja las columnas por nombre #- -vcat(x, y[1:3]) # but column names must still match +vcat(x, y[1:3]) # pero los nombres de las columnas deben ser iguales #- -append!(x, x) # the same but modifies x +append!(x, x) # lo mismo pero modificando x #- -append!(x, y) # here column names must match exactly +append!(x, y) # aquí los nombres de las columnas deben coincidir exactamente #- -push!(x, 1:5) # add one row to x at the end; must give correct number of values and correct types +push!(x, 1:5) # agrega 1 fila a `x` al final; debe dar los valores y tipos correctos x #- -push!(x, Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # also works with dictionaries +push!(x, Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # sirve igual con diccionarios x -# ### Subsetting/removing rows +# ### Quitar/Subponer filas x = DataFrame(id=1:10, val='a':'j') #- -x[1:2, :] # by index +x[1:2, :] # por su índice #- -view(x, 1:2) # the same but a view +view(x, 1:2) # igual pero con un `view` #- -x[repmat([true, false], 5), :] # by Bool, exact length required +x[repmat([true, false], 5), :] # por `Bool`, requiere longitud exacta #- -view(x, repmat([true, false], 5), :) # view again +view(x, repmat([true, false], 5), :) # `view` otra vez #- -deleterows!(x, 7) # delete one row +deleterows!(x, 7) # borrar 1 fila #- -deleterows!(x, 6:7) # delete a collection of rows +deleterows!(x, 6:7) # borrar una colección de filas #- @@ -123,11 +124,11 @@ x = DataFrame([1:4, 2:5, 3:6]) #- -filter(r -> r[:x1] > 2.5, x) # create a new DataFrame where filtering function operates on DataFrameRow +filter(r -> r[:x1] > 2.5, x) # crear un nuevo DataFrame donde la función de filtrado opera sobre un `DataFrameRow` #- -## in place modification of x, an
example with do-block syntax +## Modificar `x` in situ, con sintaxis de `do-block` filter!(x) do r if r[:x1] > 2.5 return r[:x2] < 4.5 @@ -135,7 +136,7 @@ filter!(x) do r r[:x3] < 3.5 end -# ### Deduplicating +# ### Desduplicación x = DataFrame(A=[1,2], B=["x","y"]) append!(x, x) @@ -144,34 +145,34 @@ x #- -unique(x, [1,2]) # get first unique rows for given index +unique(x, [1,2]) # Sacar la primera fila única dado un índice #- -unique(x) # now we look at whole rows +unique(x) # Ahora vemos las filas #- -nonunique(x, :A) # get indicators of non-unique rows +nonunique(x, :A) # Sacar indicadores de filas no únicas #- -unique!(x, :B) # modify x in place +unique!(x, :B) # modificar x in situ -# ### Extracting one row from `DataFrame` into a vector +# ### Extraer una fila de `DataFrame` a un vector x = DataFrame(x=[1,missing,2], y=["a", "b", missing], z=[true,false,true]) #- cols = [:x, :y] -[x[1, col] for col in cols] # subset of columns +[x[1, col] for col in cols] # subconjunto de columnas #- -[[x[i, col] for col in names(x)] for i in 1:nrow(x)] # vector of vectors, each entry contains one full row of x +[[x[i, col] for col in names(x)] for i in 1:nrow(x)] # vector de vectores, cada entrada contiene una sola fila de `x` #- -Tuple(x[1, col] for col in cols) # similar construct for Tuples, when ported to Julia 0.7 NamedTuples will be added +Tuple(x[1, col] for col in cols) # construcción similar para tuplas (`Tuples`), y tuplas nombradas (`NamedTuples`) post Julia 0.7. 
From 3bb8157c6f7c12fd983fc7d214a275f8e175a7a9 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 09:32:15 -0500 Subject: [PATCH 09/24] spanish translation of 07_factors.jl --- literate_notebooks/src-ES/07_factors.jl | 110 ++++++++++++------------ 1 file changed, 56 insertions(+), 54 deletions(-) diff --git a/literate_notebooks/src-ES/07_factors.jl b/literate_notebooks/src-ES/07_factors.jl index a3ff03c..441667b 100644 --- a/literate_notebooks/src-ES/07_factors.jl +++ b/literate_notebooks/src-ES/07_factors.jl @@ -1,110 +1,111 @@ # # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** +# (Traducción por Miguel Raz Guzmán, 18 de Abril de 2021) -using DataFrames # load package +using DataFrames -# ## Working with CategoricalArrays +# ## Trabajando con CategoricalArrays #- -# ### Constructor +# ### Constructores -x = categorical(["A", "B", "B", "C"]) # unordered +x = categorical(["A", "B", "B", "C"]) # sin orded #- -y = categorical(["A", "B", "B", "C"], ordered=true) # ordered, by default order is sorting order +y = categorical(["A", "B", "B", "C"], ordered=true) # con orden por default #- -z = categorical(["A","B","B","C", missing]) # unordered with missings +z = categorical(["A","B","B","C", missing]) # sin orden con `missing`s (valores faltantes) #- -c = cut(1:10, 5) # ordered, into equal counts, possible to rename labels and give custom breaks +c = cut(1:10, 5) # ordenados, en contenedores iguales. Es posible renombrar etiquetas y dar `break` customizadas. 
#- -by(DataFrame(x=cut(randn(100000), 10)), :x, d -> DataFrame(n=nrow(d)), sort=true) # just to make sure it works right +by(DataFrame(x=cut(randn(100000), 10)), :x, d -> DataFrame(n=nrow(d)), sort=true) # checar que todo funciona #- -v = categorical([1,2,2,3,3]) # contains integers not strings +v = categorical([1,2,2,3,3]) # contiene enteros y no cadenas #- -Vector{Union{String, Missing}}(z) # sometimes you need to convert back to a standard vector +Vector{Union{String, Missing}}(z) # a veces hay que convertir de regreso a un vector estándar -# ### Managing levels +# ### Manejando niveles arr = [x,y,z,c,v] #- -isordered.(arr) # chcek if categorical array is orderd +isordered.(arr) # checar si el arreglo categórico está ordenado #- -ordered!(x, true), isordered(x) # make x ordered +ordered!(x, true), isordered(x) # ordenar x #- -ordered!(x, false), isordered(x) # and unordered again - +ordered!(x, false), isordered(x) # y desordenarlo otra vez. #- -levels.(arr) # list levels +levels.(arr) # listar niveles #- -unique.(arr) # missing will be included +unique.(arr) # incluye `missing` #- -y[1] < y[2] # can compare as y is ordered +y[1] < y[2] # se puede comparar porque `y` está ordenado #- -v[1] < v[2] # not comparable, v is unordered although it contains integers +v[1] < v[2] # no comparable, `v` no tiene orden aunque contenga enteros #- -levels!(y, ["C", "B", "A"]) # you can reorder levels, mostly useful for ordered CategoricalArrays +levels!(y, ["C", "B", "A"]) # puedes reordenar niveles, muy útil para CategoricalArrays ordenados #- -y[1] < y[2] # observe that the order is changed +y[1] < y[2] # notar que el orden cambió #- -levels!(z, ["A", "B"]) # you have to specify all levels that are present +levels!(z, ["A", "B"]) # debes declarar todos los niveles presentes #- -levels!(z, ["A", "B"], allow_missing=true) # unless the underlying array allows for missings and force removal of levels +levels!(z, ["A", "B"], allow_missing=true) # a menos que el arreglo subyacente
permita `missing`s y obligue a quitar niveles #- z[1] = "B" -z # now z has only "B" entries +z # ahora `z` sólo tiene entradas "B" #- -levels(z) # but it remembers the levels it had (the reason is mostly performance) +levels(z) # Pero recuerda los niveles que tuvo antes (esto para mejorar performance) #- -droplevels!(z) # this way we can clean it up +droplevels!(z) # Así los podemos quitar completamente levels(z) -# ### Data manipulation +# ### Manipulación de datos x, levels(x) #- x[2] = "0" -x, levels(x) # new level added at the end (works only for unordered) +x, levels(x) # agrega nuevo nivel al final (funciona solo para casos no ordenados) #- @@ -112,43 +113,43 @@ v, levels(v) #- -v[1] + v[2] # even though underlying data is Int, we cannot operate on it +v[1] + v[2] # aunque los datos subyacentes son `Int`s, no podemos operar sobre ellos #- -Vector{Int}(v) # you have either to retrieve the data by conversion (may be expensive) +Vector{Int}(v) # tienes que recuperar los datos via conversión (potencialmente costoso) #- -get(v[1]) + get(v[2]) # or get a single value +get(v[1]) + get(v[2]) # o sacar un solo valor #- -get.(v) # this will work for arrays witout missings +get.(v) # esto funciona para arreglos sin `missings` #- -get.(z) # but will fail on missing values +get.(z) # pero falla si hay valores faltantes #- -Vector{Union{String, Missing}}(z) # you have to do the conversion +Vector{Union{String, Missing}}(z) # tienes que hacer la conversión #- -z[1]*z[2], z.^2 # the only exception are CategoricalArrays based on String - you can operate on them normally +z[1]*z[2], z.^2 # la única excepción son los `CategoricalArrays` basados en `String` - ahí puedes operar con normalidad #- -recode([1,2,3,4,5,missing], 1=>10) # recode some values in an array; has also in place recode!
equivalent +recode([1,2,3,4,5,missing], 1=>10) # recodificar algunos valores en el arreglo; existe el equivalente `recode!` in situ #- -recode([1,2,3,4,5,missing], "a", 1=>10, 2=>20) # here we provided a default value for not mapped recodings +recode([1,2,3,4,5,missing], "a", 1=>10, 2=>20) # aquí proveemos un valor default para los mapeos que no se pudieron recodificar #- -recode([1,2,3,4,5,missing], 1=>10, missing=>"missing") # to recode Missing you have to do it explicitly +recode([1,2,3,4,5,missing], 1=>10, missing=>"missing") # para recodificar a `Missing` lo tienes que hacer explícitamente #- @@ -158,64 +159,65 @@ t, levels(t) #- recode!(t, [1,3]=>2) -t, levels(t) # note that the levels are dropped after recode +t, levels(t) # notar que los niveles se borran después de `recode!` #- t = categorical([1,2,3], ordered=true) -levels(recode(t, 2=>0, 1=>-1)) # and if you introduce a new levels they are added at the end in the order of appearance +levels(recode(t, 2=>0, 1=>-1)) # y si agregas nuevos niveles, se ponen al final en el orden que se declararon #- -t = categorical([1,2,3,4,5], ordered=true) # when using default it becomes the last level +t = categorical([1,2,3,4,5], ordered=true) # al usar un valor default, este se vuelve el último nivel levels(recode(t, 300, [1,2]=>100, 3=>200)) -# ### Comparisons +# ### Comparaciones x = categorical([1,2,3]) xs = [x, categorical(x), categorical(x, ordered=true), categorical(x, ordered=true)] levels!(xs[2], [3,2,1]) levels!(xs[4], [2,3,1]) -[a == b for a in xs, b in xs] # all are equal - comparison only by contents +[a == b for a in xs, b in xs] # todos son iguales - compara sólo por contenidos #- -signature(x::CategoricalArray) = (x, levels(x), isordered(x)) # this is actually the full signature of CategoricalArray -## all are different, notice that x[1] and x[2] are unordered but have a different order of levels + +signature(x::CategoricalArray) = (x, levels(x), isordered(x)) # Esto es de hecho la asignación completa de un
CategoricalArray - TODO? +## todos son distintos, notemos que `x[1]` y `x[2]` no están ordenados pero tienen distintos órdenes de niveles [signature(a) == signature(b) for a in xs, b in xs] #- -x[1] < x[2] # you cannot compare elements of unordered CategoricalArray +x[1] < x[2] # no puedes comparar elementos de un CategoricalArray no ordenado #- -t[1] < t[2] # but you can do it for an ordered one +t[1] < t[2] # pero sí para uno ordenado #- -isless(x[1], x[2]) # isless works within the same CategoricalArray even if it is not ordered +isless(x[1], x[2]) # isless funciona dentro del mismo CategoricalArray aun si no está ordenado #- -y = deepcopy(x) # but not across categorical arrays +y = deepcopy(x) # pero no a través de arreglos categóricos isless(x[1], y[2]) #- -isless(get(x[1]), get(y[2])) # you can use get to make a comparison of the contents of CategoricalArray +isless(get(x[1]), get(y[2])) # puedes usar `get` para hacer una comparación de contenidos de un `CategoricalArray` #- -x[1] == y[2] # equality tests works OK across CategoricalArrays +x[1] == y[2] # las pruebas de igualdad funcionan a través de CategoricalArrays -# ### Categorical columns in a DataFrame +# ### Columnas categóricas en un DataFrame df = DataFrame(x = 1:3, y = 'a':'c', z = ["a","b","c"]) #- -categorical!(df) # converts all eltype(AbstractString) columns to categorical +categorical!(df) # convierte todas las columnas `eltype(AbstractString)` a columnas categóricas #- @@ -223,7 +225,7 @@ showcols(df) #- -categorical!(df, :x) # manually convert to categorical column :x +categorical!(df, :x) # convertir manualmente `:x` a una columna categórica #- From b4a9c49dda66dfe60e13c57477a165e2ecd81cd3 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 09:37:50 -0500 Subject: [PATCH 10/24] spanish translation of 08_joins.jl --- literate_notebooks/src-ES/08_joins.jl | 28 ++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git
a/literate_notebooks/src-ES/08_joins.jl b/literate_notebooks/src-ES/08_joins.jl index e52bc22..28036f6 100644 --- a/literate_notebooks/src-ES/08_joins.jl +++ b/literate_notebooks/src-ES/08_joins.jl @@ -1,13 +1,15 @@ # # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2017** +# (Traducción por Miguel Raz Guzmán Macedo, 18 de Abril 2021) -using DataFrames # load package +using DataFrames # cargar paquete -# ## Joining DataFrames +# ## Uniendo DataFrames (Joins) #- -# ### Preparing DataFrames for a join +# ### Preparando DataFrames para un join x = DataFrame(ID=[1,2,3,4,missing], name = ["Alice", "Bob", "Conor", "Dave","Zed"]) y = DataFrame(id=[1,2,5,6,missing], age = [21,22,23,24,99]) @@ -15,11 +17,11 @@ x,y #- -rename!(x, :ID=>:id) # names of columns on which we want to join must be the same +rename!(x, :ID=>:id) # los nombres de columnas que queremos unir deben ser iguales -# ### Standard joins: inner, left, right, outer, semi, anti +# ### Joins estándar: inner, left, right, outer, semi, anti -join(x, y, on=:id) # :inner join by default, missing is joined +join(x, y, on=:id) # :inner join por default, los missings se unen #- @@ -43,16 +45,16 @@ join(x, y, on=:id, kind=:anti) # ### Cross join -## cross-join does not require on argument -## it produces a Cartesian product or arguments -function expand_grid(;xs...) # a simple replacement for expand.grid in R +## un cross join no requiere un argumento +## produce un producto cartesiano de sus argumentos +function expand_grid(;xs...) 
# un reemplazo sencillo para expand.grid en R reduce((x,y) -> join(x, DataFrame(Pair(y...)), kind=:cross), DataFrame(Pair(xs[1]...)), xs[2:end]) end expand_grid(a=[1,2], b=["a","b","c"], c=[true,false]) -# ### Complex cases of joins +# ### Casos complejos de joins x = DataFrame(id1=[1,1,2,2,missing,missing], id2=[1,11,2,21,missing,99], @@ -64,13 +66,13 @@ x,y #- -join(x, y, on=[:id1, :id2]) # joining on two columns +join(x, y, on=[:id1, :id2]) # join de 2 columnas #- -join(x, y, on=[:id1], makeunique=true) # with duplicates all combinations are produced (here :inner join) +join(x, y, on=[:id1], makeunique=true) # cuando hay duplicados, se producen todas las combinaciones (:inner join en este caso) #- -join(x, y, on=[:id1], kind=:semi) # but not by :semi join (as it would duplicate rows) +join(x, y, on=[:id1], kind=:semi) # pero no por un :semi join (pues duplicaría filas) From f461dd04afacb5be328d7a6d00f89b19ebb740f3 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 11:31:42 -0500 Subject: [PATCH 11/24] spanish translation of 09_reshaping.jl --- literate_notebooks/src-ES/09_reshaping.jl | 36 ++++++++++++----------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/literate_notebooks/src-ES/09_reshaping.jl b/literate_notebooks/src-ES/09_reshaping.jl index d6ec25b..0aa4fbe 100644 --- a/literate_notebooks/src-ES/09_reshaping.jl +++ b/literate_notebooks/src-ES/09_reshaping.jl @@ -1,38 +1,40 @@ -# # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** +# (Traducido por Miguel Raz Guzmán Macedo, 18 de Abril de 2021) -using DataFrames # load package +using DataFrames -# ## Reshaping DataFrames +# ## Reorganización de DataFrames #- -# ### Wide to long +# ### De ancho a largo x = DataFrame(id=[1,2,3,4], id2=[1,1,2,2], M1=[11,12,13,14], M2=[111,112,113,114]) #- -melt(x, :id, [:M1, :M2]) # first pass id-variables and then measure variables; meltdf makes a view +melt(x, 
:id, [:M1, :M2]) # primero pasamos las variables de identificación y luego las medioms; `meltdf` (melt = derretir) crea un `view` #- -## optionally you can rename columns; melt and stack are identical but order of arguments is reversed +## opcionalmente puedes renombrar las columnas; `melt` y `stack` son idénticos, pero el orden de los argumentos se invierte stack(x, [:M1, :M2], :id, variable_name=:key, value_name=:observed) # first measures and then id-s; stackdf creates view #- -## if second argument is omitted in melt or stack , all other columns are assumed to be the second argument -## but measure variables are selected only if they are <: AbstractFloat +## si el segundo argumento se omite en `melt` o `stack`, todas las otras colummnas se asumen como en el segundo argumento +## pero las variables de medida se seleccionan sólo si <: AbstractFloat (es un subtipo de AbstractFloat) +# melt(x, [:id, :id2]) #- -melt(x, [1, 2]) # you can use index instead of symbol +melt(x, [1, 2]) # puedes usar un índice en vez de un símbolo #- -bigx = DataFrame(rand(10^6, 10)) # a test comparing creation of new DataFrame and a view +bigx = DataFrame(rand(10^6, 10)) # un test comparando la creación de un nuevo DataFrame y un `view` bigx[:id] = 1:10^6 @time melt(bigx, :id) @time melt(bigx, :id) @@ -49,15 +51,15 @@ melt(x) #- -melt(DataFrame(rand(3,2))) # by default stack and melt treats floats as value columns +melt(DataFrame(rand(3,2))) # por default, `stack` y `melt` tratan flotantes como columnas de valores #- df = DataFrame(rand(3,2)) df[:key] = [1,1,1] -mdf = melt(df) # duplicates in key are silently accepted +mdf = melt(df) # las llaves duplicadas se aceptan silenciosamente -# ### Long to wide +# ### Largo a ancho x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) @@ -69,15 +71,15 @@ display(y) #- -unstack(y, :id2, :variable, :value) # stndard unstack with a unique key +unstack(y, :id2, :variable, :value) # `unstack` estándar con llave única #- 
-unstack(y, :variable, :value) # all other columns are treated as keys +unstack(y, :variable, :value) # todas las otras columnas se tratan como llaves #- -## by default :id, :variable and :value names are assumed; in this case it produces duplicate keys +## por default `:id`, `:variable` y `:value` se asumen como nombres; en este caso eso produce llaves duplicadas unstack(y) #- @@ -86,5 +88,5 @@ df = stack(DataFrame(rand(3,2))) #- -unstack(df, :variable, :value) # unable to unstack when no key column is present +unstack(df, :variable, :value) # imposible hacer `unstack` cuando no hay columna-llave presente From 73a8f67d4e923d21084dba99ad2eb5fea57de375 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 11:38:59 -0500 Subject: [PATCH 12/24] spanish translation of 10_transforms.jl --- literate_notebooks/src-ES/10_transforms.jl | 33 ++++++++++++---------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/literate_notebooks/src-ES/10_transforms.jl b/literate_notebooks/src-ES/10_transforms.jl index 3b5b4aa..4cca725 100644 --- a/literate_notebooks/src-ES/10_transforms.jl +++ b/literate_notebooks/src-ES/10_transforms.jl @@ -1,9 +1,11 @@ # # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** +# (Traducción por Miguel Raz Guzmán Macedo, 18 de abril de 2021) -using DataFrames # load package +using DataFrames # cargar paquetería -# ## Split-apply-combine +# ## "Split-apply-combine" - Dividir, aplicar, combinar x = DataFrame(id=[1,2,3,4,1,2,3,4], id2=[1,2,1,2,1,2,1,2], v=rand(8)) @@ -17,7 +19,7 @@ gx2 = groupby(x, [:id, :id2]) #- -vcat(gx2...) # back to the original DataFrame +vcat(gx2...) 
# de regreso al DataFrame original #- @@ -25,56 +27,57 @@ x = DataFrame(id = [missing, 5, 1, 3, missing], x = 1:5) #- -showall(groupby(x, :id)) # by default groups include mising values and are not sorted +showall(groupby(x, :id)) # por default los grupos incluyen valores faltantes (`missing`) y no están ordenados #- -showall(groupby(x, :id, sort=true, skipmissing=true)) # but we can change it :) +showall(groupby(x, :id, sort=true, skipmissing=true)) # pero se puede cambiar :) #- x = DataFrame(id=rand('a':'d', 100), v=rand(100)); -by(x, :id, y->mean(y[:v])) # apply a function to each group of a data frame +by(x, :id, y->mean(y[:v])) # aplica una función a cada grupo de un DataFrame #- -by(x, :id, y->mean(y[:v]), sort=true) # we can sort the output +by(x, :id, y->mean(y[:v]), sort=true) # podemos ordenar el output #- -by(x, :id, y->DataFrame(res=mean(y[:v]))) # this way we can set a name for a column - DataFramesMeta @by is better +by(x, :id, y->DataFrame(res=mean(y[:v]))) # de esta manera podemos fijar el nombre de una columna - `DataFramesMeta @by` es mejor #- x = DataFrame(id=rand('a':'d', 100), x1=rand(100), x2=rand(100)) -aggregate(x, :id, sum) # apply a function over all columns of a data frame in groups given by id +aggregate(x, :id, sum) # aplica la función sobre todas las columnas de un DataFrame en grupos dados por `:id` #- -aggregate(x, :id, sum, sort=true) # also can be sorted +aggregate(x, :id, sum, sort=true) # también se puede ordenar # *We omit the discussion of of map/combine as I do not find them very useful (better to use by)* +# *Omitimos la discusión de `map`/`combine` pues no las encuentro tan útiles - mejor usar `by`* x = DataFrame(rand(3, 5)) #- -map(mean, eachcol(x)) # map a function over each column and return a data frame +map(mean, eachcol(x)) # mapea una función a cada columna y regresa un DataFrame #- -foreach(c -> println(c[1], ": ", mean(c[2])), eachcol(x)) # a raw iteration returns a tuple with column name and values +foreach(c -> 
println(c[1], ": ", mean(c[2])), eachcol(x)) # una iteración a secas regresa una tupla con nombres columnares y valores #- -colwise(mean, x) # colwise is similar, but produces a vector +colwise(mean, x) # `colwise` es similar, pero produce un vector #- x[:id] = [1,1,2] -colwise(mean,groupby(x, :id)) # and works on GroupedDataFrame +colwise(mean,groupby(x, :id)) # y funciona en un `GroupedDataFrame` #- -map(r -> r[:x1]/r[:x2], eachrow(x)) # now the returned value is DataFrameRow which works similarly to a one-row DataFrame +map(r -> r[:x1]/r[:x2], eachrow(x)) # ahora el valor regresado es un `DataFrameRow` el cual funciona similar a un `DataFrame` de una sola fila From 49ea34a1a65a14c7e5c2f3f4606fd4fe93714d92 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 11:48:19 -0500 Subject: [PATCH 13/24] update topics in README.md --- literate_notebooks/src-ES/README.md | 37 ++++++++++++++--------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/literate_notebooks/src-ES/README.md b/literate_notebooks/src-ES/README.md index 4a98762..4817c1c 100644 --- a/literate_notebooks/src-ES/README.md +++ b/literate_notebooks/src-ES/README.md @@ -41,7 +41,7 @@ Corrido en Julia 1.5.3. Las dependencias del proyecto son las siguientes: [10745b16] Statistics ``` -I will try to keep the material up to date as the packages evolve. +Trataré de mantener el material actualizado con la evolución de los paquetes. 
Este tutorial cubre [DataFrames](https://github.com/JuliaData/DataFrames.jl) @@ -72,19 +72,19 @@ para más información sobre la ubicación y especificación de los kernels de J | Archivo | Tema | |-------------------------------------------------------------------------------------------------------------------|-----------------------------------| -| [01_constructors.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/01_constructors.ipynb) | Creating DataFrame and conversion | -| [02_basicinfo.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/02_basicinfo.ipynb) | Getting summary information | -| [03_missingvalues.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/03_missingvalues.ipynb) | Handling missing values | -| [04_loadsave.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/04_loadsave.ipynb) | Loading and saving DataFrames | -| [05_columns.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/05_columns.ipynb) | Working with columns of DataFrame | -| [06_rows.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/06_rows.ipynb) | Working with row of DataFrame | -| [07_factors.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/07_factors.ipynb) | Working with categorical data | -| [08_joins.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/08_joins.ipynb) | Joining DataFrames | -| [09_reshaping.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/09_reshaping.ipynb) | Reshaping DataFrames | -| [10_transforms.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/10_transforms.ipynb) | Transforming DataFrames | -| [11_performance.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/11_performance.ipynb) | Performance tips | -| [12_pitfalls.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/12_pitfalls.ipynb) | Possible pitfalls 
| -| [13_extras.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/13_extras.ipynb) | Additional interesting packages | +| [01_constructors.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/01_constructors.ipynb) | Creación y construcción de DataFrames | +| [02_basicinfo.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/02_basicinfo.ipynb) | Consiguiendo información | +| [03_missingvalues.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/03_missingvalues.ipynb) | Manejo de valores faltantes | +| [04_loadsave.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/04_loadsave.ipynb) | Cargar y guardar DataFrames | +| [05_columns.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/05_columns.ipynb) | Trabajando columnas de DataFrames | +| [06_rows.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/06_rows.ipynb) | Trabajando con filas de DataFrames | +| [07_factors.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/07_factors.ipynb) | Trabajando con datos categóricos | +| [08_joins.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/08_joins.ipynb) | Uniendo DataFrames (Joins) | +| [09_reshaping.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/09_reshaping.ipynb) | Reorganizando DataFrames | +| [10_transforms.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/10_transforms.ipynb) | Transformando DataFrames | +| [11_performance.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/11_performance.ipynb) | Tips de performance | +| [12_pitfalls.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/12_pitfalls.ipynb) | Posibles errores y descuidos | +| [13_extras.ipynb](https://github.com/bkamins/Julia-DataFrames-Tutorial/blob/master/13_extras.ipynb) | Paquetes adicionales interesantes | Changelog: @@ -130,17 
+130,16 @@ Changelog: | 2020-11-26 | Updated to DataFramesMeta.jl 0.6; update by @pdeffebach | # Resumen de funciones clave: - 1. Constructors: `DataFrame`, `DataFrame!`, `Tables.rowtable`, `Tables.columntable`, `Matrix`, `eachcol`, `eachrow`, `Tables.namedtupleiterator`, `empty`, `empty!` 2. Descripciones: `size`, `nrow`, `ncol`, `describe`, `names`, `eltypes`, `first`, `last`, `getindex`, `setindex!`, `@view`, `isapprox` -3. Manejo de missing: `missing` (singleton instance of `Missing`), `ismissing`, `nonmissingtype`, `skipmissing`, `replace`, `replace!`, `coalesce`, `allowmissing`, `disallowmissing`, `allowmissing!`, `completecases`, `dropmissing`, `dropmissing!`, `disallowmissing`, `disallowmissing!`, `passmissing` -4. Cargando y guardando archivos: `CSV` (package), `CSVFiles` (package), `Serialization` (module), `CSV.read`, `CSV.write`, `save`, `load`, `serialize`, `deserialize`, `Arrow.write`, `Arrow.Table` (from Arrow.jl package), `JSONTables` (package), `arraytable`, `objecttable`, `jsontable`, `CodecZlib` (module), `GzipCompressorStream`, `GzipDecompressorStream`, `JDF.jl` (package), `JDF.savejdf`, `JDF.loadjdf`, `JLSO.jl` (package), `JLSO.save`, `JLSO.load`, `ZipFile.jl` (package), `ZipFile.reader`, `ZipFile.writer`, `ZipFile.addfile` +3. Manejo de missing: `missing` (tipo singulete de `Missing`), `ismissing`, `nonmissingtype`, `skipmissing`, `replace`, `replace!`, `coalesce`, `allowmissing`, `disallowmissing`, `allowmissing!`, `completecases`, `dropmissing`, `dropmissing!`, `disallowmissing`, `disallowmissing!`, `passmissing` +4. 
Cargando y guardando archivos: `CSV` (paquete), `CSVFiles` (paquete), `Serialization` (módulo), `CSV.read`, `CSV.write`, `save`, `load`, `serialize`, `deserialize`, `Arrow.write`, `Arrow.Table` (del paquete Arrow.jl), `JSONTables` (paquete), `arraytable`, `objecttable`, `jsontable`, `CodecZlib` (módulo), `GzipCompressorStream`, `GzipDecompressorStream`, `JDF.jl` (paquete), `JDF.savejdf`, `JDF.loadjdf`, `JLSO.jl` (paquete), `JLSO.save`, `JLSO.load`, `ZipFile.jl` (paquete), `ZipFile.reader`, `ZipFile.writer`, `ZipFile.addfile` 5. Trabajando con columnas: `rename`, `rename!`, `hcat`, `insertcols!`, `categorical!`, `columnindex`, `hasproperty`, `select`, `select!`, `transform`, `transform!`, `combine`, `Not`, `All`, `Between`, `ByRow`, `AsTable` -6. Trabajando con filas: `sort!`, `sort`, `issorted`, `append!`, `vcat`, `push!`, `view`, `filter`, `filter!`, `delete!`, `unique`, `nonunique`, `unique!`, `repeat`, `parent`, `parentindices`, `flatten`, `@pipe` (from `Pipe` package), `only` +6. Trabajando con filas: `sort!`, `sort`, `issorted`, `append!`, `vcat`, `push!`, `view`, `filter`, `filter!`, `delete!`, `unique`, `nonunique`, `unique!`, `repeat`, `parent`, `parentindices`, `flatten`, `@pipe` (del paquete `Pipe.jl`), `only` 7. Trabajando con datos categóricos: `categorical`, `cut`, `isordered`, `ordered!`, `levels`, `unique`, `levels!`, `droplevels!`, `get`, `recode`, `recode!` 8. Joins: `innerjoin`, `leftjoin`, `rightjoin`, `outerjoin`, `semijoin`, `antijoin`, `crossjoin` 9. Reorganizando: `stack`, `unstack` -10. Transformadas: `groupby`, `mapcols`, `parent`, `groupcols`, `valuecols`, `groupindices`, `keys` (for `GroupedDataFrame`), `combine`, `select`, `select!`, `transform`, `transform!`, `@pipe` (from `Pipe` package) +10. Transformadas: `groupby`, `mapcols`, `parent`, `groupcols`, `valuecols`, `groupindices`, `keys` (for `GroupedDataFrame`), `combine`, `select`, `select!`, `transform`, `transform!`, `@pipe` (del paquete `Pipe.jl`) 11. 
Extras: * [FreqTables](https://github.com/nalimilan/FreqTables.jl): `freqtable`, `prop`, `Name` * [DataFramesMeta](https://github.com/JuliaStats/DataFramesMeta.jl): `@with`, `@where`, `@select`, `@transform`, `@orderby`, `@linq`, `@by`, `@combine`, `@eachrow`, `@newcol`, `^`, `cols` From 670e4418a891871713e9fdbf241db93717632b4d Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 12:02:02 -0500 Subject: [PATCH 14/24] spanish translation of 11_performance.jl --- literate_notebooks/src-ES/11_performance.jl | 44 +++++++++++---------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/literate_notebooks/src-ES/11_performance.jl b/literate_notebooks/src-ES/11_performance.jl index 005e877..95d64a2 100644 --- a/literate_notebooks/src-ES/11_performance.jl +++ b/literate_notebooks/src-ES/11_performance.jl @@ -1,22 +1,26 @@ # # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** +# (Traducción por Miguel Raz Guzmán Macedo, 18 de abril de 2021) using DataFrames using BenchmarkTools -# ## Performance tips +# ## Tips de Performance 🚀 #- -# ### Access by column number is faster than by name +# ### Accesar columnas por número es más rápido que por nombre x = DataFrame(rand(5, 1000)) @btime x[500]; @btime x[:x500]; -# ### When working with data `DataFrame` use barrier functions or type annotation +# ### Cuando trabajes con datos de `DataFrame`, usa funciones barrera o anotaciones de tipo +# Sobre funciones barrera: https://docs.julialang.org/en/v1/manual/performance-tips/ +# Las anotaciones de tipo se ven así: `x::Int64 = 2` -function f_bad() # this function will be slow +function f_bad() # esta función va a ser lenta 🐢 srand(1); x = DataFrame(rand(1000000,2)) y, z = x[1], x[2] p = 0.0 @@ -30,11 +34,11 @@ end #- -@code_warntype f_bad() # the reason is that Julia does not know the types of columns in `DataFrame` +@code_warntype f_bad() # la razón es que Julia no conoce los tipos 
de las columnas del `DataFrame` #- -## solution 1 is to use barrier function (it should be possible to use it in almost any code) +## la primera opción es funciones barrera (debería ser posible en casi cualquier código) function f_inner(y,z) p = 0.0 for i in 1:length(y) @@ -43,12 +47,12 @@ function f_inner(y,z) p end -function f_barrier() # extract the work to an inner function +function f_barrier() # extraemos el trabajo a una función interior srand(1); x = DataFrame(rand(1000000,2)) f_inner(x[1], x[2]) end -function f_inbuilt() # or use inbuilt function if possible +function f_inbuilt() # o usamos una función preestablecida si es posible srand(1); x = DataFrame(rand(1000000,2)) dot(x[1], x[2]) end @@ -58,8 +62,8 @@ end #- -## solution 2 is to provide the types of extracted columns -## it is simpler but there are cases in which you will not know these types +## la opción 2 es proveer el tipo de las columnas extraídas - lo cual +## es más sencillo pero hay casos donde no se va a poder saber sus tipos function f_typed() srand(1); x = DataFrame(rand(1000000,2)) y::Vector{Float64}, z::Vector{Float64} = x[1], x[2] @@ -72,10 +76,10 @@ end @btime f_typed(); -# ### Consider using delayed `DataFrame` creation technique +# ### Considera usar la técnica de creación de `DataFrame` demorada function f1() - x = DataFrame(Float64, 10^4, 100) # we work with DataFrame directly + x = DataFrame(Float64, 10^4, 100) # Trabajamos con un DataFrame directamente for c in 1:ncol(x) d = x[c] for r in 1:nrow(x) @@ -94,29 +98,29 @@ function f2() end x[c] = d end - DataFrame(x) # we delay creation of DataFrame after we have our job done + DataFrame(x) # y demoramos al creación del DataFrame hasta que hayamos acabado nuestro trabaj end @btime f1(); @btime f2(); -# ### You can add rows to a `DataFrame` in place and it is fast +# ### Puedes agregar filas a un `DataFrame` in situ rápidamente 🐇 x = DataFrame(rand(10^6, 5)) y = DataFrame(transpose(1.0:5.0)) z = [1.0:5.0;] -@btime vcat($x, $y); # 
creates a new DataFrame - slow -@btime append!($x, $y); # in place - fast +@btime vcat($x, $y); # crear un nuevo DataFrame - lento 🐢 +@btime append!($x, $y); # in situ - rápido 🐇 -x = DataFrame(rand(10^6, 5)) # reset to the same starting point -@btime push!($x, $z); # add a single row in place - fastest +x = DataFrame(rand(10^6, 5)) # reseteamos al mismo punto inicial +@btime push!($x, $z); # agregar una sola fila - lo más rápido -# ### Allowing `missing` as well as `categorical` slows down computations +# ### Permitir datos `missing` y `categorical` alenta el cómputo using StatsBase -function test(data) # uses countmap function to test performance +function test(data) # usa la función countmap para medir performance println(eltype(data)) x = rand(data, 10^6) y = categorical(x) From 2f0ff6320ec03b4eb743c639dc73d3590c5025e6 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 12:09:25 -0500 Subject: [PATCH 15/24] spanish translation of 12_pitfalls.jl --- literate_notebooks/src-ES/12_pitfalls.jl | 37 +++++++++++++----------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/literate_notebooks/src-ES/12_pitfalls.jl b/literate_notebooks/src-ES/12_pitfalls.jl index 8eb5e79..c7b00db 100644 --- a/literate_notebooks/src-ES/12_pitfalls.jl +++ b/literate_notebooks/src-ES/12_pitfalls.jl @@ -1,43 +1,45 @@ # # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** +# (Traducción por Miguel Raz Guzmán Macedo, 18 de Abril de 2021) using DataFrames -# ## Possible pitfalls +# ## Posibles errors comunes #- -# ### Know what is copied when creating a `DataFrame` +# ### Hay que saber qué se copia cuando se crea un `DataFrame` x = DataFrame(rand(3, 5)) #- y = DataFrame(x) -x === y # no copyinng performed +x === y # no se hace ninguna copia #- y = copy(x) -x === y # not the same object +x === y # no es el mismo objeto #- -all(x[i] === y[i] for i in ncol(x)) # but the columns are the same 
+all(x[i] === y[i] for i in ncol(x)) # pero las columnas son las mismas #- -x = 1:3; y = [1, 2, 3]; df = DataFrame(x=x,y=y) # the same when creating arrays or assigning columns, except ranges +x = 1:3; y = [1, 2, 3]; df = DataFrame(x=x,y=y) # lo mismo sucedo cuando creamos arreglos o asignamos columnas, excepto por rangos #- -y === df[:y] # the same object +y === df[:y] # es el mismo objeto #- -typeof(x), typeof(df[:x]) # range is converted to a vector +typeof(x), typeof(df[:x]) # un rango se convierte en un vector -# ### Do not modify the parent of `GroupedDataFrame` +# ### No hay que modificar el arreglo original de `GroupedDataFrame` x = DataFrame(id=repeat([1,2], outer=3), x=1:6) g = groupby(x, :id) @@ -45,29 +47,30 @@ g = groupby(x, :id) #- x[1:3, 1]=[2,2,2] -g # well - it is wrong now, g is only a view +g # pues, está mal por ahora - `g` es sólo un `view` -# ### Remember that you can filter columns of a `DataFrame` using booleans +# ### Recuerda: peudes filtrar columnas de un `DataFrame` usando booleans srand(1) x = DataFrame(rand(5, 5)) #- -x[x[:x1] .< 0.25] # well - we have filtered columns not rows by accident as you can select columns using booleans +x[x[:x1] .< 0.25] # oops, filtramos columnas y no filas, por accidente, pues puedes seleccionar columnas usando booleanos #- -x[x[:x1] .< 0.25, :] # probably this is what we wanted +x[x[:x1] .< 0.25, :] # esto es probablemente es lo que queríamos -# ### Column selection for DataFrame creates aliases unless explicitly copied +# ### Seleccionar columnas de un DataFrame crea un alias si no se copia explícitamente x = DataFrame(a=1:3) x[:b] = x[1] # alias -x[:c] = x[:, 1] # also alias -x[:d] = x[1][:] # copy -x[:e] = copy(x[1]) # explicit copy +x[:c] = x[:, 1] # igual esalias +x[:d] = x[1][:] # copia +x[:e] = copy(x[1]) # copia explícita display(x) x[1,1] = 100 display(x) + From 669663c6cd6d3216cc77e050917bfe317fd6313b Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 12:33:41 -0500 Subject: [PATCH 
16/24] spanish translation of 13_extras.jl --- literate_notebooks/src-ES/13_extras.jl | 88 ++++++++++++++------------ 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/literate_notebooks/src-ES/13_extras.jl b/literate_notebooks/src-ES/13_extras.jl index 5140a31..89dc1b2 100644 --- a/literate_notebooks/src-ES/13_extras.jl +++ b/literate_notebooks/src-ES/13_extras.jl @@ -1,56 +1,59 @@ # # Introduction to DataFrames +# # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 13, 2018** +# (Traducción de Miguel Raz Guzmán Macedo, 18 de Abril de 2021) using DataFrames -# ## Extras - selected functionalities of selected packages +# ## Extras - funcionalidades especiales de paqueterías selectas #- -# ### FreqTables: creating cross tabulations +# ### FreqTables: crear tabulaciones cruzadas using FreqTables df = DataFrame(a=rand('a':'d', 1000), b=rand(["x", "y", "z"], 1000)) -ft = freqtable(df, :a, :b) # observe that dimensions are sorted if possible +ft = freqtable(df, :a, :b) # nota: las dimensiones se ordenan si es posible #- -ft[1,1], ft['b', "z"] # you can index the result using numbers or names +ft[1,1], ft['b', "z"] # puedes indexar el resultado con números o nombres #- -prop(ft, 1) # getting proportions - 1 means we want to calculate them in rows (first dimension) +prop(ft, 1) # obtener tamaños - 1 significa que lo queremos calcular en filas (primera dimensión) + #- -prop(ft, 2) # and columns are normalized to 1.0 now +prop(ft, 2) # y las columnas se normalizan a 1.0 #- x = categorical(rand(1:3, 10)) -levels!(x, [3, 1, 2, 4]) # reordering levels and adding an extra level -freqtable(x) # order is preserved and not-used level is shown +levels!(x, [3, 1, 2, 4]) # reorganizar niveles y añadir un nivel extra +freqtable(x) # el orden se manteine y los nivelos no usados se meustran #- -freqtable([1,1,2,3,missing]) # by default missings are listed +freqtable([1,1,2,3,missing]) # por default los valores faltatnes (`missing`s) se 
muestran en lista #- -freqtable([1,1,2,3,missing], skipmissing=true) # but we can skip them +freqtable([1,1,2,3,missing], skipmissing=true) # pero nos los podemos saltar -# ### DataFramesMeta - working on `DataFrame` +# ### DataFramesMeta - trabajando en un `DataFrame` using DataFramesMeta df = DataFrame(x=1:8, y='a':'h', z=repeat([true,false], outer=4)) #- -@with(df, :x+:z) # expressions with columns of DataFrame +@with(df, :x+:z) # expresiones con columnas de DataFrame #- -@with df begin # you can define code blocks +@with df begin # puedes definir bloques de código a = :x[:z] b = :x[.!:z] :y + [a; b] @@ -58,41 +61,41 @@ end #- -a # @with creates hard scope so variables do not leak out +a # `@with` crea un `hard scope` (alcance léxico fuerte) y así las variables no se derraman al ambiente #- df2 = DataFrame(a = [:a, :b, :c]) -@with(df2, :a .== ^(:a)) # sometimes we want to work on raw Symbol, ^() escapes it +@with(df2, :a .== ^(:a)) # A veces queremos un `Symbol` a secas, usamos ^() para escapar esa secuencia #- df2 = DataFrame(x=1:3, y=4:6, z=7:9) -@with(df2, _I_(2:3)) # _I_(expression) is translated to df2[expression] +@with(df2, _I_(2:3)) # `_I_(expresión)` se traduce a `df2[expression]` #- -@where(df, :x .< 4, :z .== true) # very useful macro for filtering +@where(df, :x .< 4, :z .== true) # macro muy útil para filtrar #- -@select(df, :x, y = 2*:x, z=:y) # create a new DataFrame based on the old one +@select(df, :x, y = 2*:x, z=:y) # crea un nuevo DataFrame basado en el anterior #- -@transform(df, a=1, x = 2*:x, y=:x) # create a new DataFrame adding columns based on the old one +@transform(df, a=1, x = 2*:x, y=:x) # crea un DataFrame agregando columnas basado en el anterior #- -@transform(df, a=1, b=:a) # old DataFrame is used and :a is not present there +@transform(df, a=1, b=:a) # se usa el DataFrame anterior y `:a` no está presente ahí #- -@orderby(df, :z, -:x) # sorting into a new data frame, less powerful than sort, but lightweight +@orderby(df, :z, -:x) 
# ordenando los datos en un nuevo DataFrame, menos poderoso que `sort` pero mucho más ligero #- -@linq df |> # chaining of operations on DataFrame +@linq df |> # podemos encadenar operaciones sobre un DataFrame where(:x .< 5) |> orderby(:z) |> transform(x²=:x.^2) |> @@ -100,76 +103,77 @@ df2 = DataFrame(x=1:3, y=4:6, z=7:9) #- -f(df, col) = df[col] # you can define your own functions and put them in the chain +f(df, col) = df[col] # puees definir tus propias funcioens y ponerlas en una cadena (`chain` = cadena, como las de metal) @linq df |> where(:x .<= 4) |> f(:x) -# ### DataFramesMeta - working on grouped `DataFrame` +# ### DataFramesMeta - trabajando con `DataFrame` agrupado df = DataFrame(a = 1:12, b = repeat('a':'d', outer=3)) g = groupby(df, :b) #- -@by(df, :b, first=first(:a), last=last(:a), mean=mean(:a)) # more convinient than by from DataFrames +@by(df, :b, first=first(:a), last=last(:a), mean=mean(:a)) # más conveniente que `by` de `DataFrames.jl` #- -@based_on(g, first=first(:a), last=last(:a), mean=mean(:a)) # the same as by but on grouped DataFrame +@based_on(g, first=first(:a), last=last(:a), mean=mean(:a)) #lo mismo que `by` pero en un DataFrame agrupado #- -@where(g, mean(:a) > 6.5) # filter gropus on aggregate conditions +@where(g, mean(:a) > 6.5) # filtramos grupos con condiciones agregadas #- -@orderby(g, -sum(:a)) # order groups on aggregate conditions +@orderby(g, -sum(:a)) # ordenar grupos con condiciones agregadas #- -@transform(g, center = mean(:a), centered = :a - mean(:a)) # perform operations within a group and return ungroped DataFrame +@transform(g, center = mean(:a), centered = :a - mean(:a)) # aplicar operaciones dentro de un grupo y regresar un DataFrame no agrupado #- -DataFrame(g) # a nice convinience function not defined in DataFrames +DataFrame(g) # una función auxiliar bonita no definida en DataFrames.jl #- -@transform(g) # actually this is the same +@transform(g) # de hecho esto es lo mismo #- -@linq df |> groupby(:b) |> 
where(mean(:a) > 6.5) |> DataFrame # you can do chaining on grouped DataFrames as well +@linq df |> groupby(:b) |> where(mean(:a) > 6.5) |> DataFrame # puedes encadenar operaciones sobre DataFrames agrupado también -# ### DataFramesMeta - rowwise operations on `DataFrame` +# ### DataFramesMeta - operaciones por filas en un `DataFrame` df = DataFrame(a = 1:12, b = repeat(1:4, outer=3)) #- -## such conditions are often needed but are complex to write +## dichas condiciones suelen ser necesarios pero son demasiado complejas para escribirlas @transform(df, x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no")) #- -## one option is to use a function that works on a single observation and broadcast it +## una opciön es usar una función que se aplica sobre una sola observación y la broadcasteamos +# Broadcasting en el manual: https://docs.julialang.org/en/v1/manual/arrays/#Broadcasting myfun(a, b) = a > 6 && b == 4 ? "yes" : "no" @transform(df, x = myfun.(:a, :b)) #- -## or you can use @byrow! macro that allows you to process DataFrame rowwise +## o puedes usar el macro `@byrow` que permite procesar el DataFrame por filas @byrow! df begin @newcol x::Vector{String} :x = :a > 6 && :b == 4 ? 
"yes" : "no" end -# ### Visualizing data with StatPlots +# ### Visualizando datos con StatPlots -using StatPlots # you might need to setup Plots package and some plotting backend first +using StatPlots # Puede ser necesario instalar algunos paquetes de Plots.jl y el backend antes de proseguir #- -## we present only a minimal functionality of the package +## presentamos la funcionalidad mínima de este paquete, no nos da para cubrir todo lo que ofrece #- @@ -178,15 +182,15 @@ df = DataFrame(x = sort(randn(1000)), y=randn(1000), z = [fill("b", 500); fill(" #- -@df df plot(:x, :y, legend=:topleft, label="y(x)") # a most basic plot +@df df plot(:x, :y, legend=:topleft, label="y(x)") # la gráfica más básica #- -@df df density(:x, label="") # density plot +@df df density(:x, label="") # gráfica de densidad #- -@df df histogram(:y, label="y") # and a histogram +@df df histogram(:y, label="y") # un histograma #- From fb6a33c9e62ed8b99ee58fdff6e7d59a1079f2e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?= Date: Sun, 18 Apr 2021 12:43:54 -0500 Subject: [PATCH 17/24] Update literate_notebooks/src-ES/01_constructors.jl Co-authored-by: pdeffebach <23196228+pdeffebach@users.noreply.github.com> --- literate_notebooks/src-ES/01_constructors.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/literate_notebooks/src-ES/01_constructors.jl b/literate_notebooks/src-ES/01_constructors.jl index 62ffa39..c09b7eb 100644 --- a/literate_notebooks/src-ES/01_constructors.jl +++ b/literate_notebooks/src-ES/01_constructors.jl @@ -12,7 +12,7 @@ using DataFrames # ### Constructores # -# En esta secciónn, verás distintas maneras de crear un `DataFrame` usando el constructor `DataFrame()`. +# En esta sección, verás distintas maneras de crear un `DataFrame` usando el constructor `DataFrame()`. # # Primero, creemos un DataFrame vacío. 
@@ -142,4 +142,3 @@ df = DataFrame(:a=>1, :a=>2, :a_1=>3) # No puedes pasar `makeunique` para permitir duplicados en este caso. df = DataFrame(a=1, a=2, makeunique=true) - From e2a8f257956eb0b30e98de99f5b1232d8057e799 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?= Date: Sun, 18 Apr 2021 12:44:02 -0500 Subject: [PATCH 18/24] Update literate_notebooks/src-ES/01_constructors.jl Co-authored-by: pdeffebach <23196228+pdeffebach@users.noreply.github.com> --- literate_notebooks/src-ES/01_constructors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/literate_notebooks/src-ES/01_constructors.jl b/literate_notebooks/src-ES/01_constructors.jl index c09b7eb..b1b68c5 100644 --- a/literate_notebooks/src-ES/01_constructors.jl +++ b/literate_notebooks/src-ES/01_constructors.jl @@ -22,7 +22,7 @@ DataFrame() # DataFrame vacío DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3])) -# Podemos creat el `DataFrame` de un diccionario, en cuyo caso las llaves del diccionario estarán ordenadas para crear las columnas del `DataFrame`. +# Podemos crear el `DataFrame` de un diccionario, en cuyo caso las llaves del diccionario estarán ordenadas para crear las columnas del `DataFrame`. 
x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b']) DataFrame(x) From c862b3b6c7acbd252c941b52e84fda48b1098283 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?= Date: Sun, 18 Apr 2021 12:44:28 -0500 Subject: [PATCH 19/24] Update literate_notebooks/src-ES/01_constructors.jl Co-authored-by: pdeffebach <23196228+pdeffebach@users.noreply.github.com> --- literate_notebooks/src-ES/01_constructors.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/literate_notebooks/src-ES/01_constructors.jl b/literate_notebooks/src-ES/01_constructors.jl index b1b68c5..099c051 100644 --- a/literate_notebooks/src-ES/01_constructors.jl +++ b/literate_notebooks/src-ES/01_constructors.jl @@ -90,7 +90,6 @@ z = copy(x) # ### Conversión a matrices # -# Let's start by creating a `DataFrame` with two rows and two columns. # Empecemos creando un `DataFrame` con dos filas y dos columnas. x = DataFrame(x=1:2, y=["A", "B"]) From 8a31c93866f2ab14f7b82a6a4f1ff6719dedab8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miguel=20Raz=20Guzm=C3=A1n=20Macedo?= Date: Sun, 18 Apr 2021 12:44:42 -0500 Subject: [PATCH 20/24] Update literate_notebooks/src-ES/01_constructors.jl Co-authored-by: pdeffebach <23196228+pdeffebach@users.noreply.github.com> --- literate_notebooks/src-ES/01_constructors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/literate_notebooks/src-ES/01_constructors.jl b/literate_notebooks/src-ES/01_constructors.jl index 099c051..7ffa16c 100644 --- a/literate_notebooks/src-ES/01_constructors.jl +++ b/literate_notebooks/src-ES/01_constructors.jl @@ -127,7 +127,7 @@ Matrix(x) Matrix{Int}(x) -# ### Lidiando con nombres de columnas repetidos +# ### Lipiando con nombres de columnas repetidos # # Podemos pasar el keyword argument `makeunique` para permitir usar nombres duplicados (se desduplican) From b88e172244388c143425ce53d0b6f46a90e01950 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 12:53:07 -0500 Subject: [PATCH 21/24] 
spanish translation - add cheatsheets and language comparisons docs --- literate_notebooks/src-ES/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/literate_notebooks/src-ES/README.md b/literate_notebooks/src-ES/README.md index 4817c1c..083f852 100644 --- a/literate_notebooks/src-ES/README.md +++ b/literate_notebooks/src-ES/README.md @@ -7,6 +7,13 @@ Una breve introducción al uso de los [DataFrames](https://github.com/JuliaData/DataFrames.jl). +Nota: si ya sabes como usar datos tabulares en otros lenguajes, notoriamente `pandas` en Python o `dplyr` en R, y Stata, +este paquete te resultará muy familiar. Aún así hay detalles en donde difieron las funcionalidades principales, +y vale la pena leer los siguientes materiales para evitar confusión: + +- un `cheatsheet` [de Julia <-> Python <-> Matlab](https://cheatsheets.quantecon.org/) de Quantecon +- la documentación comparativa de DataFrames.jl [con los otros lenguajes](https://dataframes.juliadata.org/latest/man/comparisons/#Comparisons) + Este tutorial contiene una especificación de la versión del proyecto bajo el cual debería correr. 
Para preparar este ambiente, antes de usar los notebooks, hay que correr la siguiente línea en el folder del proyecto: From c0dd3c49e049f9af44c0b0ffd16ee94e99b8639b Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 13:06:31 -0500 Subject: [PATCH 22/24] spanish translation - missed intro in 03_missingvalues.jl --- literate_notebooks/src-ES/03_missingvalues.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/literate_notebooks/src-ES/03_missingvalues.jl b/literate_notebooks/src-ES/03_missingvalues.jl index 3ef9776..eb9bbbb 100644 --- a/literate_notebooks/src-ES/03_missingvalues.jl +++ b/literate_notebooks/src-ES/03_missingvalues.jl @@ -1,4 +1,3 @@ -# # Introduction to DataFrames # # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** # (Traducción de Miguel Raz Guzmán Macedo) From 96a326bd64930b04a26c5646d6f06b90389f7a02 Mon Sep 17 00:00:00 2001 From: miguel raz Date: Sun, 18 Apr 2021 17:37:52 -0500 Subject: [PATCH 23/24] spanish translation - add .toml files, fixup typos and word choices --- literate_notebooks/src-ES/02_basicinfo.jl | 5 +- literate_notebooks/src-ES/03_missingvalues.jl | 2 +- literate_notebooks/src-ES/04_loadsave.jl | 2 +- literate_notebooks/src-ES/06_rows.jl | 2 +- literate_notebooks/src-ES/07_factors.jl | 5 +- literate_notebooks/src-ES/08_joins.jl | 1 - literate_notebooks/src-ES/09_reshaping.jl | 4 +- literate_notebooks/src-ES/10_transforms.jl | 7 +- literate_notebooks/src-ES/11_performance.jl | 1 - literate_notebooks/src-ES/12_pitfalls.jl | 5 +- literate_notebooks/src-ES/13_extras.jl | 8 +- literate_notebooks/src-ES/Manifest.toml | 1227 +++++++++++++++++ literate_notebooks/src-ES/Project.toml | 22 + 13 files changed, 1267 insertions(+), 24 deletions(-) create mode 100644 literate_notebooks/src-ES/Manifest.toml create mode 100644 literate_notebooks/src-ES/Project.toml diff --git a/literate_notebooks/src-ES/02_basicinfo.jl b/literate_notebooks/src-ES/02_basicinfo.jl index 3801219..4c37b0d 
100644 --- a/literate_notebooks/src-ES/02_basicinfo.jl +++ b/literate_notebooks/src-ES/02_basicinfo.jl @@ -1,4 +1,3 @@ -# # Introduction to DataFrames # # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), May 23, 2018** # Traducción por Miguel Raz, Abril 17, 2021 @@ -28,7 +27,7 @@ describe(x) showcols(x) -# `names` regresa el nombre de todas las columnas, +# `names` devuelve el nombre de todas las columnas, names(x) @@ -54,7 +53,7 @@ tail(y, 3) x[1], x[:A], x[:, 1] -# Para tomar una hilera de un DataFrame, lo indexamos como sigue +# Para tomar una fila de un DataFrame, lo indexamos como sigue x[1, :] diff --git a/literate_notebooks/src-ES/03_missingvalues.jl b/literate_notebooks/src-ES/03_missingvalues.jl index eb9bbbb..1659e06 100644 --- a/literate_notebooks/src-ES/03_missingvalues.jl +++ b/literate_notebooks/src-ES/03_missingvalues.jl @@ -49,7 +49,7 @@ map(x -> x(missing, 1), [+, - , *, /, div]) # part 2 map(x -> x([1,2,missing]), [minimum, maximum, extrema, mean, any, float]) # part 3 -# `skipmissing` regresa un iterador que salta valores faltantes. Podemos usar `collect` y `skipmissing` para crear un arreglo que excluye estos valores faltantes. +# `skipmissing` devuelve un iterador que salta valores faltantes. Podemos usar `collect` y `skipmissing` para crear un arreglo que excluye estos valores faltantes. collect(skipmissing([1, missing, 2, missing])) diff --git a/literate_notebooks/src-ES/04_loadsave.jl b/literate_notebooks/src-ES/04_loadsave.jl index 9607057..c934a6f 100644 --- a/literate_notebooks/src-ES/04_loadsave.jl +++ b/literate_notebooks/src-ES/04_loadsave.jl @@ -31,7 +31,7 @@ CSV.write("x.csv", x) print(read("x.csv", String)) -# También lo podemos cargar de regreso. `use_mmap=false` desabilita el uso de `memory mapping` para los archivos se puedan borrar en la misma sesión en Windows. +# También lo podemos cargar de vuelta. 
`use_mmap=false` desabilita el uso de `memory mapping` para los archivos se puedan borrar en la misma sesión en Windows. y = CSV.read("x.csv", use_mmap=false) diff --git a/literate_notebooks/src-ES/06_rows.jl b/literate_notebooks/src-ES/06_rows.jl index b67d333..2f95e7f 100644 --- a/literate_notebooks/src-ES/06_rows.jl +++ b/literate_notebooks/src-ES/06_rows.jl @@ -116,7 +116,7 @@ deleterows!(x, 7) # borrar 1 fila #- -deleterows!(x, 6:7) # borrar una colleción de filas +deleterows!(x, 6:7) # borrar una colección de filas #- diff --git a/literate_notebooks/src-ES/07_factors.jl b/literate_notebooks/src-ES/07_factors.jl index 441667b..e002957 100644 --- a/literate_notebooks/src-ES/07_factors.jl +++ b/literate_notebooks/src-ES/07_factors.jl @@ -1,4 +1,3 @@ -# # Introduction to DataFrames # # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** # (Traducción por Miguel Raz Guzmán, 18 de Abril de 2021) @@ -11,7 +10,7 @@ using DataFrames # ### Constructores -x = categorical(["A", "B", "B", "C"]) # sin orded +x = categorical(["A", "B", "B", "C"]) # sin orden #- @@ -217,7 +216,7 @@ df = DataFrame(x = 1:3, y = 'a':'c', z = ["a","b","c"]) #- -categorical!(df) # conviertir todos las columnas `eltype(AbstractString)` a columnas categóricas +categorical!(df) # convierte todas las columnas `eltype(AbstractString)` a columnas categóricas #- diff --git a/literate_notebooks/src-ES/08_joins.jl b/literate_notebooks/src-ES/08_joins.jl index 28036f6..205495a 100644 --- a/literate_notebooks/src-ES/08_joins.jl +++ b/literate_notebooks/src-ES/08_joins.jl @@ -1,4 +1,3 @@ -# # Introduction to DataFrames # # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2017** # (Traducción por Miguel Raz Guzmán Macedo, 18 de Abril 2021) diff --git a/literate_notebooks/src-ES/09_reshaping.jl b/literate_notebooks/src-ES/09_reshaping.jl index 0aa4fbe..ff7ae68 100644 --- a/literate_notebooks/src-ES/09_reshaping.jl +++ 
b/literate_notebooks/src-ES/09_reshaping.jl @@ -14,7 +14,7 @@ x = DataFrame(id=[1,2,3,4], id2=[1,1,2,2], M1=[11,12,13,14], M2=[111,112,113,114 #- -melt(x, :id, [:M1, :M2]) # primero pasamos las variables de identificación y luego las medioms; `meltdf` (melt = derretir) crea un `view` +melt(x, :id, [:M1, :M2]) # primero pasamos las variables de identificación y luego las medimos; `meltdf` (melt = derretir) crea un `view` #- @@ -59,7 +59,7 @@ df = DataFrame(rand(3,2)) df[:key] = [1,1,1] mdf = melt(df) # las llaves duplicadas se aceptan silenciosamente -# ### Largo a ancho +# ### De largo a ancho x = DataFrame(id = [1,1,1], id2=['a','b','c'], a1 = rand(3), a2 = rand(3)) diff --git a/literate_notebooks/src-ES/10_transforms.jl b/literate_notebooks/src-ES/10_transforms.jl index 4cca725..d669a1b 100644 --- a/literate_notebooks/src-ES/10_transforms.jl +++ b/literate_notebooks/src-ES/10_transforms.jl @@ -1,4 +1,3 @@ -# # Introduction to DataFrames # # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** # (Traducción por Miguel Raz Guzmán Macedo, 18 de abril de 2021) @@ -62,11 +61,11 @@ x = DataFrame(rand(3, 5)) #- -map(mean, eachcol(x)) # mapea una función a cada columna y regresa un DataFrame +map(mean, eachcol(x)) # mapea una función a cada columna y devuelve un DataFrame #- -foreach(c -> println(c[1], ": ", mean(c[2])), eachcol(x)) # una iteración a secas regresa una tupla con nombres columnares y valores +foreach(c -> println(c[1], ": ", mean(c[2])), eachcol(x)) # una iteración a secas devuelve una tupla con nombres columnares y valores #- @@ -79,5 +78,5 @@ colwise(mean,groupby(x, :id)) # y funciona en un `GroupedDataFrame` #- -map(r -> r[:x1]/r[:x2], eachrow(x)) # ahora el valor regresado es un `DataFrameRow` el cual funciona similar a un `DataFrame` de una sola fila +map(r -> r[:x1]/r[:x2], eachrow(x)) # ahora el valor devuelto es un `DataFrameRow` el cual funciona similar a un `DataFrame` de una sola fila diff --git 
a/literate_notebooks/src-ES/11_performance.jl b/literate_notebooks/src-ES/11_performance.jl index 95d64a2..9dc1c2a 100644 --- a/literate_notebooks/src-ES/11_performance.jl +++ b/literate_notebooks/src-ES/11_performance.jl @@ -1,4 +1,3 @@ -# # Introduction to DataFrames # # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** # (Traducción por Miguel Raz Guzmán Macedo, 18 de abril de 2021) diff --git a/literate_notebooks/src-ES/12_pitfalls.jl b/literate_notebooks/src-ES/12_pitfalls.jl index c7b00db..4bec234 100644 --- a/literate_notebooks/src-ES/12_pitfalls.jl +++ b/literate_notebooks/src-ES/12_pitfalls.jl @@ -1,11 +1,10 @@ -# # Introduction to DataFrames # # Introducción a DataFrames # **[Bogumił Kamiński](http://bogumilkaminski.pl/about/), Apr 21, 2018** # (Traducción por Miguel Raz Guzmán Macedo, 18 de Abril de 2021) using DataFrames -# ## Posibles errors comunes +# ## Posibles errores comunes #- @@ -49,7 +48,7 @@ g = groupby(x, :id) x[1:3, 1]=[2,2,2] g # pues, está mal por ahora - `g` es sólo un `view` -# ### Recuerda: peudes filtrar columnas de un `DataFrame` usando booleans +# ### Recuerda: puedes filtrar columnas de un `DataFrame` usando booleans srand(1) x = DataFrame(rand(5, 5)) diff --git a/literate_notebooks/src-ES/13_extras.jl b/literate_notebooks/src-ES/13_extras.jl index 89dc1b2..9748df3 100644 --- a/literate_notebooks/src-ES/13_extras.jl +++ b/literate_notebooks/src-ES/13_extras.jl @@ -32,11 +32,11 @@ prop(ft, 2) # y las columnas se normalizan a 1.0 x = categorical(rand(1:3, 10)) levels!(x, [3, 1, 2, 4]) # reorganizar niveles y añadir un nivel extra -freqtable(x) # el orden se manteine y los nivelos no usados se meustran +freqtable(x) # el orden se mantiene y los niveles no usados se muestran #- -freqtable([1,1,2,3,missing]) # por default los valores faltatnes (`missing`s) se muestran en lista +freqtable([1,1,2,3,missing]) # por default los valores faltantes (`missing`s) se muestran como lista #- @@ -103,7 
+103,7 @@ df2 = DataFrame(x=1:3, y=4:6, z=7:9) #- -f(df, col) = df[col] # puees definir tus propias funcioens y ponerlas en una cadena (`chain` = cadena, como las de metal) +f(df, col) = df[col] # puedes definir tus propias funcioens y ponerlas en una cadena (`chain` = cadena, como las de metal) @linq df |> where(:x .<= 4) |> f(:x) # ### DataFramesMeta - trabajando con `DataFrame` agrupado @@ -149,7 +149,7 @@ df = DataFrame(a = 1:12, b = repeat(1:4, outer=3)) #- -## dichas condiciones suelen ser necesarios pero son demasiado complejas para escribirlas +## dichas condiciones suelen ser necesarias pero son demasiado complejas para escribirlas @transform(df, x = ifelse.((:a .> 6) .& (:b .== 4), "yes", "no")) #- diff --git a/literate_notebooks/src-ES/Manifest.toml b/literate_notebooks/src-ES/Manifest.toml new file mode 100644 index 0000000..03693db --- /dev/null +++ b/literate_notebooks/src-ES/Manifest.toml @@ -0,0 +1,1227 @@ +# This file is machine-generated - editing it directly is not advised + +[[AbstractFFTs]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "051c95d6836228d120f5f4b984dd5aba1624f716" +uuid = "621f4979-c628-5d54-868e-fcf4e3e8185c" +version = "0.5.0" + +[[Adapt]] +deps = ["LinearAlgebra"] +git-tree-sha1 = "42c42f2221906892ceb765dbcb1a51deeffd86d7" +uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +version = "2.3.0" + +[[Arpack]] +deps = ["Arpack_jll", "Libdl", "LinearAlgebra"] +git-tree-sha1 = "2ff92b71ba1747c5fdd541f8fc87736d82f40ec9" +uuid = "7d9fca2a-8960-54d3-9f78-7d1dccf2cb97" +version = "0.4.0" + +[[Arpack_jll]] +deps = ["Libdl", "OpenBLAS_jll", "Pkg"] +git-tree-sha1 = "e214a9b9bd1b4e1b4f15b22c0994862b66af7ff7" +uuid = "68821587-b530-5797-8361-c406ea357684" +version = "3.5.0+3" + +[[Arrow]] +deps = ["CodecLz4", "CodecZstd", "DataAPI", "Dates", "Mmap", "PooledArrays", "SentinelArrays", "Tables", "TimeZones"] +git-tree-sha1 = "e2196f539c141a98d66dc50145b67325b7842b1f" +uuid = "69666777-d1a9-59fb-9406-91d4454c9d45" +version = "1.0.1" + +[[Artifacts]] 
+deps = ["Pkg"] +git-tree-sha1 = "c30985d8821e0cd73870b17b0ed0ce6dc44cb744" +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" +version = "1.3.0" + +[[AxisAlgorithms]] +deps = ["LinearAlgebra", "Random", "SparseArrays", "WoodburyMatrices"] +git-tree-sha1 = "a4d07a1c313392a77042855df46c5f534076fab9" +uuid = "13072b0f-2c55-5437-9ae7-d433b7a33950" +version = "1.0.0" + +[[BSON]] +git-tree-sha1 = "dd36d7cf3d185eeaaf64db902c15174b22f5dafb" +uuid = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" +version = "0.2.6" + +[[Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[BenchmarkTools]] +deps = ["JSON", "Logging", "Printf", "Statistics", "UUIDs"] +git-tree-sha1 = "9e62e66db34540a0c919d72172cc2f642ac71260" +uuid = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +version = "0.5.0" + +[[Blosc]] +deps = ["Blosc_jll"] +git-tree-sha1 = "84cf7d0f8fd46ca6f1b3e0305b4b4a37afe50fd6" +uuid = "a74b3585-a348-5f62-a45c-50e91977d574" +version = "0.7.0" + +[[Blosc_jll]] +deps = ["Libdl", "Lz4_jll", "Pkg", "Zlib_jll", "Zstd_jll"] +git-tree-sha1 = "aa9ef39b54a168c3df1b2911e7797e4feee50fbe" +uuid = "0b7ba130-8d10-5ba8-a3d6-c5182647fed9" +version = "1.14.3+1" + +[[BufferedStreams]] +deps = ["Compat", "Test"] +git-tree-sha1 = "5d55b9486590fdda5905c275bb21ce1f0754020f" +uuid = "e1450e63-4bb3-523b-b2a4-4ffa8c0fd77d" +version = "1.0.0" + +[[Bzip2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "c3598e525718abcc440f69cc6d5f60dda0a1b61e" +uuid = "6e34b625-4abd-537c-b88f-471c36dfa7a0" +version = "1.0.6+5" + +[[CSV]] +deps = ["Dates", "Mmap", "Parsers", "PooledArrays", "SentinelArrays", "Tables", "Unicode"] +git-tree-sha1 = "290a56b2448024a1501834ee8b7d5d7004bc5ad3" +uuid = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +version = "0.8.2" + +[[Cairo_jll]] +deps = ["Artifacts", "Bzip2_jll", "Fontconfig_jll", "FreeType2_jll", "Glib_jll", "JLLWrappers", "LZO_jll", "Libdl", "Pixman_jll", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll", "Zlib_jll", "libpng_jll"] +git-tree-sha1 = 
"e2f47f6d8337369411569fd45ae5753ca10394c6" +uuid = "83423d85-b0ee-5818-9007-b63ccbeb887a" +version = "1.16.0+6" + +[[CategoricalArrays]] +deps = ["DataAPI", "Future", "JSON", "Missings", "Printf", "Statistics", "StructTypes", "Unicode"] +git-tree-sha1 = "5861101791fa76fafe8dddefd70ffbfe4e33ecae" +uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" +version = "0.9.0" + +[[Clustering]] +deps = ["Distances", "LinearAlgebra", "NearestNeighbors", "Printf", "SparseArrays", "Statistics", "StatsBase"] +git-tree-sha1 = "75479b7df4167267d75294d14b58244695beb2ac" +uuid = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5" +version = "0.14.2" + +[[CodecLz4]] +deps = ["Lz4_jll", "TranscodingStreams"] +git-tree-sha1 = "59fe0cb37784288d6b9f1baebddbf75457395d40" +uuid = "5ba52731-8f18-5e0d-9241-30f10d1ec561" +version = "0.4.0" + +[[CodecZlib]] +deps = ["TranscodingStreams", "Zlib_jll"] +git-tree-sha1 = "ded953804d019afa9a3f98981d99b33e3db7b6da" +uuid = "944b1d66-785c-5afd-91f1-9de20f533193" +version = "0.7.0" + +[[CodecZstd]] +deps = ["TranscodingStreams", "Zstd_jll"] +git-tree-sha1 = "d19cd9ae79ef31774151637492291d75194fc5fa" +uuid = "6b39b394-51ab-5f42-8807-6242bab2b4c2" +version = "0.7.0" + +[[ColorSchemes]] +deps = ["ColorTypes", "Colors", "FixedPointNumbers", "Random", "StaticArrays"] +git-tree-sha1 = "5d472aa8908568bc198564db06983913a6c2c8e7" +uuid = "35d6a980-a343-548e-a6ea-1d62b119f2f4" +version = "3.10.1" + +[[ColorTypes]] +deps = ["FixedPointNumbers", "Random"] +git-tree-sha1 = "4bffea7ed1a9f0f3d1a131bbcd4b925548d75288" +uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f" +version = "0.10.9" + +[[Colors]] +deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Reexport"] +git-tree-sha1 = "008d6bc68dea6beb6303fdc37188cb557391ebf2" +uuid = "5ae59095-9a9b-59fe-a467-6f913c188581" +version = "0.12.4" + +[[Combinatorics]] +git-tree-sha1 = "08c8b6831dc00bfea825826be0bc8336fc369860" +uuid = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" +version = "1.0.2" + +[[Compat]] +deps = ["Base64", "Dates", 
"DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "SHA", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"] +git-tree-sha1 = "a706ff10f1cd8dab94f59fd09c0e657db8e77ff0" +uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" +version = "3.23.0" + +[[CompilerSupportLibraries_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8e695f735fca77e9708e795eda62afdb869cbb70" +uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" +version = "0.3.4+0" + +[[Conda]] +deps = ["JSON", "VersionParsing"] +git-tree-sha1 = "c0647249d785f1d5139c0cc96db8f6b32f7ec416" +uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" +version = "1.5.0" + +[[Contour]] +deps = ["StaticArrays"] +git-tree-sha1 = "0d128f9c2d9560349dc46f60c42036e244271d72" +uuid = "d38c429a-6771-53c6-b99e-75d170b6e991" +version = "0.5.6" + +[[Crayons]] +git-tree-sha1 = "3f71217b538d7aaee0b69ab47d9b7724ca8afa0d" +uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" +version = "4.0.4" + +[[DataAPI]] +git-tree-sha1 = "ad84f52c0b8f05aa20839484dbaf01690b41ff84" +uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" +version = "1.4.0" + +[[DataFrames]] +deps = ["CategoricalArrays", "Compat", "DataAPI", "Future", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrettyTables", "Printf", "REPL", "Reexport", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] +git-tree-sha1 = "20159837c2e5e196793a313cd700b8199fd8f985" +uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +version = "0.22.1" + +[[DataFramesMeta]] +deps = ["DataFrames", "Reexport"] +git-tree-sha1 = "d2b8f08f3b84ba53321d5609a622ad9f61998a6a" +uuid = "1313f7d8-7da2-5740-9ea0-a2ca25f37964" +version = "0.6.0" + +[[DataStructures]] +deps = ["Compat", "InteractiveUtils", "OrderedCollections"] +git-tree-sha1 = "fb0aa371da91c1ff9dc7fbed6122d3e411420b9c" +uuid = 
"864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" +version = "0.18.8" + +[[DataValueInterfaces]] +git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" +uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" +version = "1.0.0" + +[[DataValues]] +deps = ["DataValueInterfaces", "Dates"] +git-tree-sha1 = "d88a19299eba280a6d062e135a43f00323ae70bf" +uuid = "e7dc6d0d-1eca-5fa6-8ad6-5aecde8b7ea5" +version = "0.4.13" + +[[Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[DelimitedFiles]] +deps = ["Mmap"] +uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab" + +[[Distances]] +deps = ["LinearAlgebra", "Statistics"] +git-tree-sha1 = "e8b13ba5f166e11df2de6fc283e5db7864245df0" +uuid = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" +version = "0.10.0" + +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + +[[Distributions]] +deps = ["FillArrays", "LinearAlgebra", "PDMats", "Printf", "QuadGK", "Random", "SparseArrays", "SpecialFunctions", "StaticArrays", "Statistics", "StatsBase", "StatsFuns"] +git-tree-sha1 = "6493eec6bfb1e578cff879b66844807e3625c83c" +uuid = "31c24e10-a181-5473-b8eb-7969acd0382f" +version = "0.24.4" + +[[DocStringExtensions]] +deps = ["LibGit2", "Markdown", "Pkg", "Test"] +git-tree-sha1 = "50ddf44c53698f5e784bbebb3f4b21c5807401b1" +uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" +version = "0.8.3" + +[[EarCut_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "92d8f9f208637e8d2d28c664051a00569c01493d" +uuid = "5ae413db-bbd1-5e63-b57d-d24a61df00f5" +version = "2.1.5+1" + +[[Expat_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1402e52fcda25064f51c77a9655ce8680b76acf0" +uuid = "2e619515-83b5-522b-bb60-26c02a35a201" +version = "2.2.7+6" + +[[ExprTools]] +git-tree-sha1 = "10407a39b87f29d47ebaca8edbc75d7c302ff93e" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.3" + +[[EzXML]] +deps = ["Printf", "XML2_jll"] +git-tree-sha1 = 
"0fa3b52a04a4e210aeb1626def9c90df3ae65268" +uuid = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" +version = "1.1.0" + +[[FFMPEG]] +deps = ["FFMPEG_jll", "x264_jll"] +git-tree-sha1 = "9a73ffdc375be61b0e4516d83d880b265366fe1f" +uuid = "c87230d0-a227-11e9-1b43-d7ebe4e7570a" +version = "0.4.0" + +[[FFMPEG_jll]] +deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "JLLWrappers", "LAME_jll", "LibVPX_jll", "Libdl", "Ogg_jll", "OpenSSL_jll", "Opus_jll", "Pkg", "Zlib_jll", "libass_jll", "libfdk_aac_jll", "libvorbis_jll", "x264_jll", "x265_jll"] +git-tree-sha1 = "3cc57ad0a213808473eafef4845a74766242e05f" +uuid = "b22a6f82-2f65-5046-a5b2-351ab43fb4e5" +version = "4.3.1+4" + +[[FFTW]] +deps = ["AbstractFFTs", "FFTW_jll", "IntelOpenMP_jll", "Libdl", "LinearAlgebra", "MKL_jll", "Reexport"] +git-tree-sha1 = "8b7c16b56936047ca41bf25effa137ae0b381ae8" +uuid = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +version = "1.2.4" + +[[FFTW_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "f10c3009373a2d5c4349b8a2932d8accb892892d" +uuid = "f5851436-0d7a-5f13-b9de-f02708fd171a" +version = "3.3.9+6" + +[[FileIO]] +deps = ["Pkg"] +git-tree-sha1 = "cad2e71389ecb2f4480e0de74faab04af13d7929" +uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" +version = "1.4.4" + +[[FilePathsBase]] +deps = ["Dates", "Mmap", "Printf", "Test", "UUIDs"] +git-tree-sha1 = "eea043eb9e9087e53815e9587e9106027c3c6b14" +uuid = "48062228-2e41-5def-b9a4-89aafe57970f" +version = "0.9.5" + +[[FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[FillArrays]] +deps = ["LinearAlgebra", "Random", "SparseArrays"] +git-tree-sha1 = "c1cf9e87a5c45f0c05dc31ae95757f706e70865a" +uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" +version = "0.10.1" + +[[FixedPointNumbers]] +deps = ["Statistics"] +git-tree-sha1 = "335bfdceacc84c5cdf16aadc768aa5ddfc5383cc" +uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93" +version = "0.8.4" + +[[Fontconfig_jll]] +deps = ["Artifacts", "Bzip2_jll", "Expat_jll", "FreeType2_jll", 
"JLLWrappers", "Libdl", "Libuuid_jll", "Pkg", "Zlib_jll"] +git-tree-sha1 = "35895cf184ceaab11fd778b4590144034a167a2f" +uuid = "a3f928ae-7b40-5064-980b-68af3947d34b" +version = "2.13.1+14" + +[[Formatting]] +deps = ["Printf"] +git-tree-sha1 = "a0c901c29c0e7c763342751c0a94211d56c0de5c" +uuid = "59287772-0a20-5a39-b81b-1366585eb4c0" +version = "0.4.1" + +[[FreeType2_jll]] +deps = ["Artifacts", "Bzip2_jll", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "cbd58c9deb1d304f5a245a0b7eb841a2560cfec6" +uuid = "d7e528f0-a631-5988-bf34-fe36492bcfd7" +version = "2.10.1+5" + +[[FreqTables]] +deps = ["CategoricalArrays", "Missings", "NamedArrays", "Tables"] +git-tree-sha1 = "3adc3eefa0cd2042f2513240f95b36c8ddd0495d" +uuid = "da1fdf0e-e0ff-5433-a45f-9bb5ff651cb1" +version = "0.4.2" + +[[FriBidi_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "0d20aed5b14dd4c9a2453c1b601d08e1149679cc" +uuid = "559328eb-81f9-559d-9380-de523a88c83c" +version = "1.0.5+6" + +[[Future]] +deps = ["Random"] +uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" + +[[GLFW_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libglvnd_jll", "Pkg", "Xorg_libXcursor_jll", "Xorg_libXi_jll", "Xorg_libXinerama_jll", "Xorg_libXrandr_jll"] +git-tree-sha1 = "a1bbf700b5388bffc3d882f4f4d625cf1c714fd7" +uuid = "0656b61e-2033-5cc2-a64a-77c0f6c09b89" +version = "3.3.2+1" + +[[GR]] +deps = ["Base64", "DelimitedFiles", "GR_jll", "HTTP", "JSON", "LinearAlgebra", "Pkg", "Printf", "Random", "Serialization", "Sockets", "Test", "UUIDs"] +git-tree-sha1 = "b90b826782cb3ac5b7a7f41b3fd0113180257ed4" +uuid = "28b8d3ca-fb5f-59d9-8090-bfdbd6d07a71" +version = "0.53.0" + +[[GR_jll]] +deps = ["Bzip2_jll", "Cairo_jll", "FFMPEG_jll", "Fontconfig_jll", "GLFW_jll", "JpegTurbo_jll", "Libdl", "Libtiff_jll", "Pixman_jll", "Pkg", "Qhull_jll", "Qt_jll", "Zlib_jll", "libpng_jll"] +git-tree-sha1 = "daaccb414719ae63625b9b5e0eb4b1ec5b194590" +uuid = "d2c73de3-f751-5644-a686-071e5b155ba9" +version = "0.52.0+0" + 
+[[GeometryBasics]] +deps = ["EarCut_jll", "IterTools", "LinearAlgebra", "StaticArrays", "StructArrays", "Tables"] +git-tree-sha1 = "876a906eab3be990fdcbfe1e43bb3a76f4776f72" +uuid = "5c1252a2-5f33-56bf-86c9-59e7332b4326" +version = "0.3.3" + +[[Gettext_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "XML2_jll"] +git-tree-sha1 = "8c14294a079216000a0bdca5ec5a447f073ddc9d" +uuid = "78b55507-aeef-58d4-861c-77aaff3498b1" +version = "0.20.1+7" + +[[Glib_jll]] +deps = ["Artifacts", "Gettext_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Libiconv_jll", "Libmount_jll", "PCRE_jll", "Pkg", "Zlib_jll"] +git-tree-sha1 = "04690cc5008b38ecbdfede949220bc7d9ba26397" +uuid = "7746bdde-850d-59dc-9ae8-88ece973131d" +version = "2.59.0+4" + +[[Grisu]] +git-tree-sha1 = "03d381f65183cb2d0af8b3425fde97263ce9a995" +uuid = "42e2da0e-8278-4e71-bc24-59509adca0fe" +version = "1.0.0" + +[[HTTP]] +deps = ["Base64", "Dates", "IniFile", "MbedTLS", "Sockets"] +git-tree-sha1 = "c7ec02c4c6a039a98a15f955462cd7aea5df4508" +uuid = "cd3eb016-35fb-5094-929b-558a96fad6f3" +version = "0.8.19" + +[[IJulia]] +deps = ["Base64", "Conda", "Dates", "InteractiveUtils", "JSON", "Markdown", "MbedTLS", "Pkg", "Printf", "REPL", "Random", "SoftGlobalScope", "Test", "UUIDs", "ZMQ"] +git-tree-sha1 = "68e1792f3ca9a0df3b4e59d03a3aca828726917e" +uuid = "7073ff75-c697-5162-941a-fcdaad2a7d2a" +version = "1.23.0" + +[[IniFile]] +deps = ["Test"] +git-tree-sha1 = "098e4d2c533924c921f9f9847274f2ad89e018b8" +uuid = "83e8ac13-25f8-5344-8a64-a9f2b223428f" +version = "0.5.0" + +[[IntelOpenMP_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "fb8e1c7a5594ba56f9011310790e03b5384998d6" +uuid = "1d5cc7b8-4909-519e-a0f8-d0f5ad9712d0" +version = "2018.0.3+0" + +[[InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[Interpolations]] +deps = ["AxisAlgorithms", "LinearAlgebra", "OffsetArrays", "Random", "Ratios", "SharedArrays", "SparseArrays", "StaticArrays", "WoodburyMatrices"] 
+git-tree-sha1 = "d2ff0813f0f110918db2537201686575fcf8d345" +uuid = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" +version = "0.13.0" + +[[InvertedIndices]] +deps = ["Test"] +git-tree-sha1 = "15732c475062348b0165684ffe28e85ea8396afc" +uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" +version = "1.0.0" + +[[IterTools]] +git-tree-sha1 = "05110a2ab1fc5f932622ffea2a003221f4782c18" +uuid = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +version = "1.3.0" + +[[IteratorInterfaceExtensions]] +git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" +uuid = "82899510-4779-5014-852e-03e436cf321d" +version = "1.0.0" + +[[JDF]] +deps = ["Blosc", "BufferedStreams", "CategoricalArrays", "DataAPI", "DataFrames", "Missings", "PooledArrays", "Serialization", "StatsBase", "Tables", "TimeZones", "WeakRefStrings"] +git-tree-sha1 = "77fcb0f3f55378587bf33b2560417104415f3b9d" +uuid = "babc3d20-cd49-4f60-a736-a8f9c08892d3" +version = "0.2.20" + +[[JLLWrappers]] +git-tree-sha1 = "c70593677bbf2c3ccab4f7500d0f4dacfff7b75c" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.1.3" + +[[JLSO]] +deps = ["BSON", "CodecZlib", "FilePathsBase", "Memento", "Pkg", "Serialization"] +git-tree-sha1 = "85124b548bf4c2bb59284d353aa09ffc224d761f" +uuid = "9da8a3cd-07a3-59c0-a743-3fdc52c30d11" +version = "2.4.0" + +[[JSON]] +deps = ["Dates", "Mmap", "Parsers", "Unicode"] +git-tree-sha1 = "81690084b6198a2e1da36fcfda16eeca9f9f24e4" +uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +version = "0.21.1" + +[[JSON3]] +deps = ["Dates", "Mmap", "Parsers", "StructTypes", "UUIDs"] +git-tree-sha1 = "961ef1c3e5c8a595d5bec270a9007429ef12ed10" +uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" +version = "1.5.1" + +[[JSONTables]] +deps = ["JSON3", "StructTypes", "Tables"] +git-tree-sha1 = "15ffb1561865803d7a52ed6714408647c9710af6" +uuid = "b9914132-a727-11e9-1322-f18e41205b0b" +version = "1.0.0" + +[[JpegTurbo_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9aff0587d9603ea0de2c6f6300d9f9492bbefbd3" +uuid = 
"aacddb02-875f-59d6-b918-886e6ef4fbf8" +version = "2.0.1+3" + +[[KernelDensity]] +deps = ["Distributions", "DocStringExtensions", "FFTW", "Interpolations", "StatsBase"] +git-tree-sha1 = "09aeec87bdc9c1fa70d0b508dfa94a21acd280d9" +uuid = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" +version = "0.6.2" + +[[LAME_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "df381151e871f41ee86cee4f5f6fd598b8a68826" +uuid = "c1c5ebd0-6772-5130-a774-d5fcae4a789d" +version = "3.100.0+3" + +[[LZO_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "f128cd6cd05ffd6d3df0523ed99b90ff6f9b349a" +uuid = "dd4b983a-f0e5-5f8d-a1b7-129d4a5fb1ac" +version = "2.10.0+3" + +[[LaTeXStrings]] +git-tree-sha1 = "c7aebfecb1a60d59c0fe023a68ec947a208b1e6b" +uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +version = "1.2.0" + +[[Latexify]] +deps = ["Formatting", "InteractiveUtils", "LaTeXStrings", "MacroTools", "Markdown", "Printf", "Requires"] +git-tree-sha1 = "8771ad2b1464aa6188899ca0c3e432341e35f96a" +uuid = "23fbe1c1-3f47-55db-b15f-69d7ec21a316" +version = "0.14.5" + +[[LibGit2]] +deps = ["Printf"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[LibVPX_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "85fcc80c3052be96619affa2fe2e6d2da3908e11" +uuid = "dd192d2f-8180-539f-9fb4-cc70b1dcf69a" +version = "1.9.0+1" + +[[Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[Libffi_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a2cd088a88c0d37eef7d209fd3d8712febce0d90" +uuid = "e9f186c6-92d2-5b65-8a66-fee21dc1b490" +version = "3.2.1+4" + +[[Libgcrypt_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgpg_error_jll", "Pkg"] +git-tree-sha1 = "b391a18ab1170a2e568f9fb8d83bc7c780cb9999" +uuid = "d4300ac3-e22c-5743-9152-c294e39db1e4" +version = "1.8.5+4" + +[[Libglvnd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll", "Xorg_libXext_jll"] +git-tree-sha1 = 
"7739f837d6447403596a75d19ed01fd08d6f56bf" +uuid = "7e76a0d4-f3c7-5321-8279-8d96eeed0f29" +version = "1.3.0+3" + +[[Libgpg_error_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "ec7f2e8ad5c9fa99fc773376cdbc86d9a5a23cb7" +uuid = "7add5ba3-2f88-524e-9cd5-f83b8a55f7b8" +version = "1.36.0+3" + +[[Libiconv_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "8e924324b2e9275a51407a4e06deb3455b1e359f" +uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" +version = "1.16.0+7" + +[[Libmount_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "51ad0c01c94c1ce48d5cad629425035ad030bfd5" +uuid = "4b2f31a3-9ecc-558c-b454-b3730dcb73e9" +version = "2.34.0+3" + +[[Libtiff_jll]] +deps = ["Artifacts", "JLLWrappers", "JpegTurbo_jll", "Libdl", "Pkg", "Zlib_jll", "Zstd_jll"] +git-tree-sha1 = "291dd857901f94d683973cdf679984cdf73b56d0" +uuid = "89763e89-9b03-5906-acba-b20f662cd828" +version = "4.1.0+2" + +[[Libuuid_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "f879ae9edbaa2c74c922e8b85bb83cc84ea1450b" +uuid = "38a345b3-de98-5d2b-a5d3-14cd9215e700" +version = "2.34.0+7" + +[[LinearAlgebra]] +deps = ["Libdl"] +uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" + +[[Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[Lz4_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "51b1db0732bbdcfabb60e36095cc3ed9c0016932" +uuid = "5ced341a-0733-55b8-9ab6-a4889d929147" +version = "1.9.2+2" + +[[MKL_jll]] +deps = ["IntelOpenMP_jll", "Libdl", "Pkg"] +git-tree-sha1 = "eb540ede3aabb8284cb482aa41d00d6ca850b1f8" +uuid = "856f044c-d86e-5d09-b602-aeab76dc8ba7" +version = "2020.2.254+0" + +[[MacroTools]] +deps = ["Markdown", "Random"] +git-tree-sha1 = "6a8a2a625ab0dea913aba95c11370589e0239ff0" +uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" +version = "0.5.6" + +[[Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[MbedTLS]] +deps = ["Dates", "MbedTLS_jll", "Random", "Sockets"] 
+git-tree-sha1 = "1c38e51c3d08ef2278062ebceade0e46cefc96fe" +uuid = "739be429-bea8-5141-9913-cc70e7f3736d" +version = "1.0.3" + +[[MbedTLS_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "0eef589dd1c26a3ac9d753fe1a8bcad63f956fa6" +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.16.8+1" + +[[Measures]] +git-tree-sha1 = "e498ddeee6f9fdb4551ce855a46f54dbd900245f" +uuid = "442fdcdd-2543-5da2-b0f3-8c86c306513e" +version = "0.3.1" + +[[Memento]] +deps = ["Dates", "Distributed", "JSON", "Serialization", "Sockets", "Syslogs", "Test", "TimeZones", "UUIDs"] +git-tree-sha1 = "d6dfb54d7e8a9b4a2b1773acf7275a4f607906b2" +uuid = "f28f55f0-a522-5efc-85c2-fe41dfb9b2d9" +version = "1.1.2" + +[[Missings]] +deps = ["DataAPI"] +git-tree-sha1 = "ed61674a0864832495ffe0a7e889c0da76b0f4c8" +uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +version = "0.4.4" + +[[Mmap]] +uuid = "a63ad114-7e13-5084-954f-fe012c677804" + +[[Mocking]] +deps = ["ExprTools"] +git-tree-sha1 = "916b850daad0d46b8c71f65f719c49957e9513ed" +uuid = "78c3b35d-d492-501b-9361-3d52fe80e533" +version = "0.7.1" + +[[MultivariateStats]] +deps = ["Arpack", "LinearAlgebra", "SparseArrays", "Statistics", "StatsBase"] +git-tree-sha1 = "352fae519b447bf52e6de627b89f448bcd469e4e" +uuid = "6f286f6a-111f-5878-ab1e-185364afe411" +version = "0.7.0" + +[[NaNMath]] +git-tree-sha1 = "bfe47e760d60b82b66b61d2d44128b62e3a369fb" +uuid = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +version = "0.3.5" + +[[NamedArrays]] +deps = ["Combinatorics", "DataStructures", "DelimitedFiles", "InvertedIndices", "LinearAlgebra", "Random", "Requires", "SparseArrays", "Statistics"] +git-tree-sha1 = "7d96d4c09526458d66ff84d7648be7eb7c38a547" +uuid = "86f7a689-2022-50b4-a561-43c23ac3c673" +version = "0.9.4" + +[[NearestNeighbors]] +deps = ["Distances", "StaticArrays"] +git-tree-sha1 = "da77adc83db31176804ce8307e61ef5bedca2e58" +uuid = "b8a86587-4115-5ab1-83bc-aa920d37bbce" +version = "0.4.7" + +[[Observables]] +git-tree-sha1 = 
"635fe10760447cfa86f5118edf2f47eb864fb495" +uuid = "510215fc-4207-5dde-b226-833fc4488ee2" +version = "0.3.2" + +[[OffsetArrays]] +deps = ["Adapt"] +git-tree-sha1 = "9db93b990af57b3a56dca38476832f60d58f777b" +uuid = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" +version = "1.4.0" + +[[Ogg_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "a42c0f138b9ebe8b58eba2271c5053773bde52d0" +uuid = "e7412a2a-1a6e-54c0-be00-318e2571c051" +version = "1.3.4+2" + +[[OpenBLAS_jll]] +deps = ["CompilerSupportLibraries_jll", "Libdl", "Pkg"] +git-tree-sha1 = "0c922fd9634e358622e333fc58de61f05a048492" +uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" +version = "0.3.9+5" + +[[OpenSSL_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "71bbbc616a1d710879f5a1021bcba65ffba6ce58" +uuid = "458c3c95-2e84-50aa-8efc-19380b2a3a95" +version = "1.1.1+6" + +[[OpenSpecFun_jll]] +deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "9db77584158d0ab52307f8c04f8e7c08ca76b5b3" +uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" +version = "0.5.3+4" + +[[Opus_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "f9d57f4126c39565e05a2b0264df99f497fc6f37" +uuid = "91d4177d-7536-5919-b921-800302f37372" +version = "1.3.1+3" + +[[OrderedCollections]] +git-tree-sha1 = "cf59cfed2e2c12e8a2ff0a4f1e9b2cd8650da6db" +uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" +version = "1.3.2" + +[[PCRE_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "1b556ad51dceefdbf30e86ffa8f528b73c7df2bb" +uuid = "2f80f16e-611a-54ab-bc61-aa92de5b98fc" +version = "8.42.0+4" + +[[PDMats]] +deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse", "Test"] +git-tree-sha1 = "95a4038d1011dfdbde7cecd2ad0ac411e53ab1bc" +uuid = "90014a1f-27ba-587c-ab20-58faa44d9150" +version = "0.10.1" + +[[Parsers]] +deps = ["Dates"] +git-tree-sha1 = "b417be52e8be24e916e34b3d70ec2da7bdf56a68" +uuid = 
"69de0a69-1ddd-5017-9359-2bf0b02dc9f0" +version = "1.0.12" + +[[Pipe]] +git-tree-sha1 = "6842804e7867b115ca9de748a0cf6b364523c16d" +uuid = "b98c9c47-44ae-5843-9183-064241ee97a0" +version = "1.3.0" + +[[Pixman_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "392d85fe2fd2c613442f9694dd566c0d5641d58c" +uuid = "30392449-352a-5448-841d-b1acce4e97dc" +version = "0.38.4+5" + +[[Pkg]] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" + +[[PlotThemes]] +deps = ["PlotUtils", "Requires", "Statistics"] +git-tree-sha1 = "c6f5ea535551b3b16835134697f0c65d06c94b91" +uuid = "ccf2f8ad-2431-5c83-bf29-c5338b663b6a" +version = "2.0.0" + +[[PlotUtils]] +deps = ["ColorSchemes", "Colors", "Dates", "Printf", "Random", "Reexport", "Statistics"] +git-tree-sha1 = "4e098f88dad9a2b518b83124a116be1c49e2b2bf" +uuid = "995b91a9-d308-5afd-9ec6-746e21dbc043" +version = "1.0.7" + +[[Plots]] +deps = ["Base64", "Contour", "Dates", "FFMPEG", "FixedPointNumbers", "GR", "GeometryBasics", "JSON", "Latexify", "LinearAlgebra", "Measures", "NaNMath", "PlotThemes", "PlotUtils", "Printf", "REPL", "Random", "RecipesBase", "RecipesPipeline", "Reexport", "Requires", "Scratch", "Showoff", "SparseArrays", "Statistics", "StatsBase", "UUIDs"] +git-tree-sha1 = "173c7250ccd7c98615b04c669eb13fa7fab494b0" +uuid = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +version = "1.9.1" + +[[PooledArrays]] +deps = ["DataAPI"] +git-tree-sha1 = "b1333d4eced1826e15adbdf01a4ecaccca9d353c" +uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +version = "0.5.3" + +[[PrettyTables]] +deps = ["Crayons", "Formatting", "Markdown", "Reexport", "Tables"] +git-tree-sha1 = "237170206bf38a66fee4d845f4ae57f63788eeb0" +uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" +version = "0.10.1" + +[[Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[Qhull_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = 
"585989201bf8741e165ae52df54de79c5299daa7" +uuid = "784f63db-0788-585a-bace-daefebcd302b" +version = "2019.1.0+2" + +[[Qt_jll]] +deps = ["Artifacts", "Fontconfig_jll", "Glib_jll", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll", "Xorg_libxcb_jll", "Xorg_xcb_util_image_jll", "Xorg_xcb_util_keysyms_jll", "Xorg_xcb_util_renderutil_jll", "Xorg_xcb_util_wm_jll", "Zlib_jll", "xkbcommon_jll"] +git-tree-sha1 = "72244a8e084251aea25968c61bbf5c001aaa7d5a" +uuid = "ede63266-ebff-546c-83e0-1c6fb6d0efc8" +version = "5.15.1+0" + +[[QuadGK]] +deps = ["DataStructures", "LinearAlgebra"] +git-tree-sha1 = "12fbe86da16df6679be7521dfb39fbc861e1dc7b" +uuid = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" +version = "2.4.1" + +[[REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[Random]] +deps = ["Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[Ratios]] +git-tree-sha1 = "37d210f612d70f3f7d57d488cb3b6eff56ad4e41" +uuid = "c84ed2f1-dad5-54f0-aa8e-dbefe2724439" +version = "0.4.0" + +[[RecipesBase]] +git-tree-sha1 = "b3fb709f3c97bfc6e948be68beeecb55a0b340ae" +uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" +version = "1.1.1" + +[[RecipesPipeline]] +deps = ["Dates", "NaNMath", "PlotUtils", "RecipesBase"] +git-tree-sha1 = "9ea2f5bf1b26918b16e9f885bb8e05206bfc2144" +uuid = "01d81517-befc-4cb6-b9ec-a95719d0359c" +version = "0.2.1" + +[[Reexport]] +deps = ["Pkg"] +git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0" +uuid = "189a3867-3050-52da-a836-e630ba90ab69" +version = "0.2.0" + +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "e05c53ebc86933601d36212a93b39144a2733493" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.1.1" + +[[Rmath]] +deps = ["Random", "Rmath_jll"] +git-tree-sha1 = "86c5647b565873641538d8f812c04e4c9dbeb370" +uuid = "79098fc4-a85e-5d69-aa6a-4863f24498fa" +version = "0.6.1" + +[[Rmath_jll]] +deps = ["Libdl", "Pkg"] +git-tree-sha1 = "d76185aa1f421306dec73c057aa384bad74188f0" +uuid = 
"f50d1b31-88e8-58de-be2c-1cc44531875f" +version = "0.2.2+1" + +[[SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" + +[[Scratch]] +deps = ["Dates"] +git-tree-sha1 = "ad4b278adb62d185bbcb6864dc24959ab0627bf6" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.0.3" + +[[SentinelArrays]] +deps = ["Dates", "Random"] +git-tree-sha1 = "6ccde405cf0759eba835eb613130723cb8f10ff9" +uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" +version = "1.2.16" + +[[Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[SharedArrays]] +deps = ["Distributed", "Mmap", "Random", "Serialization"] +uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383" + +[[Showoff]] +deps = ["Dates", "Grisu"] +git-tree-sha1 = "ee010d8f103468309b8afac4abb9be2e18ff1182" +uuid = "992d4aef-0814-514b-bc4d-f2e9a6c4116f" +version = "0.3.2" + +[[Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[SoftGlobalScope]] +deps = ["REPL"] +git-tree-sha1 = "986ec2b6162ccb95de5892ed17832f95badf770c" +uuid = "b85f4697-e234-5449-a836-ec8e2f98b302" +version = "1.1.0" + +[[SortingAlgorithms]] +deps = ["DataStructures", "Random", "Test"] +git-tree-sha1 = "03f5898c9959f8115e30bc7226ada7d0df554ddd" +uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" +version = "0.3.1" + +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[SpecialFunctions]] +deps = ["OpenSpecFun_jll"] +git-tree-sha1 = "bf68b90f72f81dd1519b289b7403c591cfdd6a88" +uuid = "276daf66-3868-5448-9aa4-cd146d93841b" +version = "1.0.0" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "da4cf579416c81994afd6322365d00916c79b8ae" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.12.5" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + +[[StatsBase]] +deps = ["DataAPI", "DataStructures", "LinearAlgebra", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics"] +git-tree-sha1 = 
"7bab7d4eb46b225b35179632852b595a3162cb61" +uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" +version = "0.33.2" + +[[StatsFuns]] +deps = ["Rmath", "SpecialFunctions"] +git-tree-sha1 = "3b9f665c70712af3264b61c27a7e1d62055dafd1" +uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +version = "0.9.6" + +[[StatsPlots]] +deps = ["Clustering", "DataStructures", "DataValues", "Distributions", "Interpolations", "KernelDensity", "LinearAlgebra", "MultivariateStats", "Observables", "Plots", "RecipesBase", "RecipesPipeline", "Reexport", "StatsBase", "TableOperations", "Tables", "Widgets"] +git-tree-sha1 = "0904a834846e7f4796636171fe002368e755dffc" +uuid = "f3b207a7-027a-5e70-b257-86293d7955fd" +version = "0.14.17" + +[[StructArrays]] +deps = ["Adapt", "DataAPI", "Tables"] +git-tree-sha1 = "8099ed9fb90b6e754d6ba8c6ed8670f010eadca0" +uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" +version = "0.4.4" + +[[StructTypes]] +deps = ["Dates", "UUIDs"] +git-tree-sha1 = "1ed04f622a39d2e5a6747c3a70be040c00333933" +uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4" +version = "1.1.0" + +[[SuiteSparse]] +deps = ["Libdl", "LinearAlgebra", "Serialization", "SparseArrays"] +uuid = "4607b0f0-06f3-5cda-b6b1-a6196a1729e9" + +[[Syslogs]] +deps = ["Printf", "Sockets"] +git-tree-sha1 = "46badfcc7c6e74535cc7d833a91f4ac4f805f86d" +uuid = "cea106d9-e007-5e6c-ad93-58fe2094e9c4" +version = "0.3.0" + +[[TableOperations]] +deps = ["SentinelArrays", "Tables", "Test"] +git-tree-sha1 = "85490cabedd41c56cf7574daec34769e0e2851b9" +uuid = "ab02a1b2-a7df-11e8-156e-fb1833f50b87" +version = "0.3.0" + +[[TableTraits]] +deps = ["IteratorInterfaceExtensions"] +git-tree-sha1 = "b1ad568ba658d8cbb3b892ed5380a6f3e781a81e" +uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" +version = "1.0.0" + +[[Tables]] +deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "TableTraits", "Test"] +git-tree-sha1 = "5131a624173d532299d1c7eb05341c18112b21b8" +uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +version = "1.2.1" 
+ +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[[TimeZones]] +deps = ["Dates", "EzXML", "Mocking", "Pkg", "Printf", "RecipesBase", "Serialization", "Unicode"] +git-tree-sha1 = "e8a5ab7e56d23bf147585001d33d969c655d4091" +uuid = "f269a46b-ccf7-5d73-abea-4c690281aa53" +version = "1.5.2" + +[[TranscodingStreams]] +deps = ["Random", "Test"] +git-tree-sha1 = "7c53c35547de1c5b9d46a4797cf6d8253807108c" +uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" +version = "0.9.5" + +[[UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[VersionParsing]] +git-tree-sha1 = "80229be1f670524750d905f8fc8148e5a8c4537f" +uuid = "81def892-9a0e-5fdd-b105-ffc91e053289" +version = "1.2.0" + +[[Wayland_jll]] +deps = ["Artifacts", "Expat_jll", "JLLWrappers", "Libdl", "Libffi_jll", "Pkg", "XML2_jll"] +git-tree-sha1 = "dc643a9b774da1c2781413fd7b6dcd2c56bb8056" +uuid = "a2964d1f-97da-50d4-b82a-358c7fce9d89" +version = "1.17.0+4" + +[[Wayland_protocols_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Wayland_jll"] +git-tree-sha1 = "2839f1c1296940218e35df0bbb220f2a79686670" +uuid = "2381bf8a-dfd0-557d-9999-79630e7b1b91" +version = "1.18.0+4" + +[[WeakRefStrings]] +deps = ["DataAPI", "Random", "Test"] +git-tree-sha1 = "28807f85197eaad3cbd2330386fac1dcb9e7e11d" +uuid = "ea10d353-3f73-51f8-a26c-33c1cb351aa5" +version = "0.6.2" + +[[Widgets]] +deps = ["Colors", "Dates", "Observables", "OrderedCollections"] +git-tree-sha1 = "fc0feda91b3fef7fe6948ee09bb628f882b49ca4" +uuid = "cc8bc4a8-27d6-5769-a93b-9d913e69aa62" +version = "0.6.2" + +[[WoodburyMatrices]] +deps = ["LinearAlgebra", "SparseArrays"] +git-tree-sha1 = "59e2ad8fd1591ea019a5259bd012d7aee15f995c" +uuid = "efce3f68-66dc-5838-9240-27a6d6f5f9b6" +version = "0.5.3" + +[[XML2_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Pkg", "Zlib_jll"] 
+git-tree-sha1 = "be0db24f70aae7e2b89f2f3092e93b8606d659a6" +uuid = "02c8fc9c-b97f-50b9-bbe4-9be30ff0a78a" +version = "2.9.10+3" + +[[XSLT_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Libgcrypt_jll", "Pkg", "XML2_jll"] +git-tree-sha1 = "2b3eac39df218762d2d005702d601cd44c997497" +uuid = "aed1982a-8fda-507f-9586-7b0439959a61" +version = "1.1.33+4" + +[[Xorg_libX11_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxcb_jll", "Xorg_xtrans_jll"] +git-tree-sha1 = "5be649d550f3f4b95308bf0183b82e2582876527" +uuid = "4f6342f7-b3d2-589e-9d20-edeb45f2b2bc" +version = "1.6.9+4" + +[[Xorg_libXau_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4e490d5c960c314f33885790ed410ff3a94ce67e" +uuid = "0c0b7dd1-d40b-584c-a123-a41640f87eec" +version = "1.0.9+4" + +[[Xorg_libXcursor_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXfixes_jll", "Xorg_libXrender_jll"] +git-tree-sha1 = "12e0eb3bc634fa2080c1c37fccf56f7c22989afd" +uuid = "935fb764-8cf2-53bf-bb30-45bb1f8bf724" +version = "1.2.0+4" + +[[Xorg_libXdmcp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "4fe47bd2247248125c428978740e18a681372dd4" +uuid = "a3789734-cfe1-5b06-b2d0-1dd0d9d62d05" +version = "1.1.3+4" + +[[Xorg_libXext_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = "b7c0aa8c376b31e4852b360222848637f481f8c3" +uuid = "1082639a-0dae-5f34-9b06-72781eeb8cb3" +version = "1.3.4+4" + +[[Xorg_libXfixes_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = "0e0dc7431e7a0587559f9294aeec269471c991a4" +uuid = "d091e8ba-531a-589c-9de9-94069b037ed8" +version = "5.0.3+4" + +[[Xorg_libXi_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll", "Xorg_libXfixes_jll"] +git-tree-sha1 = "89b52bc2160aadc84d707093930ef0bffa641246" +uuid = "a51aa0fd-4e3c-5386-b890-e753decda492" +version = "1.7.10+4" + +[[Xorg_libXinerama_jll]] +deps = 
["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll"] +git-tree-sha1 = "26be8b1c342929259317d8b9f7b53bf2bb73b123" +uuid = "d1454406-59df-5ea1-beac-c340f2130bc3" +version = "1.1.4+4" + +[[Xorg_libXrandr_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libXext_jll", "Xorg_libXrender_jll"] +git-tree-sha1 = "34cea83cb726fb58f325887bf0612c6b3fb17631" +uuid = "ec84b674-ba8e-5d96-8ba1-2a689ba10484" +version = "1.5.2+4" + +[[Xorg_libXrender_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = "19560f30fd49f4d4efbe7002a1037f8c43d43b96" +uuid = "ea2f1a96-1ddc-540d-b46f-429655e07cfa" +version = "0.9.10+4" + +[[Xorg_libpthread_stubs_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6783737e45d3c59a4a4c4091f5f88cdcf0908cbb" +uuid = "14d82f49-176c-5ed1-bb49-ad3f5cbd8c74" +version = "0.1.0+3" + +[[Xorg_libxcb_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "XSLT_jll", "Xorg_libXau_jll", "Xorg_libXdmcp_jll", "Xorg_libpthread_stubs_jll"] +git-tree-sha1 = "daf17f441228e7a3833846cd048892861cff16d6" +uuid = "c7cfdc94-dc32-55de-ac96-5a1b8d977c5b" +version = "1.13.0+3" + +[[Xorg_libxkbfile_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libX11_jll"] +git-tree-sha1 = "926af861744212db0eb001d9e40b5d16292080b2" +uuid = "cc61e674-0454-545c-8b26-ed2c68acab7a" +version = "1.1.0+4" + +[[Xorg_xcb_util_image_jll]] +deps = ["Libdl", "Pkg", "Xorg_xcb_util_jll"] +git-tree-sha1 = "16eb9a5aa027fb877207bf9915686366c2d5c064" +uuid = "12413925-8142-5f55-bb0e-6d7ca50bb09b" +version = "0.4.0+0" + +[[Xorg_xcb_util_jll]] +deps = ["Libdl", "Pkg", "Xorg_libxcb_jll"] +git-tree-sha1 = "6b47a94261a67078fe3d3922363bd9fd83b6eb1d" +uuid = "2def613f-5ad1-5310-b15b-b15d46f528f5" +version = "0.4.0+0" + +[[Xorg_xcb_util_keysyms_jll]] +deps = ["Libdl", "Pkg", "Xorg_xcb_util_jll"] +git-tree-sha1 = "40771f688e17baa121136b649e631e1868a6678e" +uuid = "975044d2-76e6-5fbe-bf08-97ce7c6574c7" +version = 
"0.4.0+0" + +[[Xorg_xcb_util_renderutil_jll]] +deps = ["Libdl", "Pkg", "Xorg_xcb_util_jll"] +git-tree-sha1 = "72c9b59211a97f763a9ca82351d37ebc04a6858a" +uuid = "0d47668e-0667-5a69-a72c-f761630bfb7e" +version = "0.3.9+0" + +[[Xorg_xcb_util_wm_jll]] +deps = ["Libdl", "Pkg", "Xorg_xcb_util_jll"] +git-tree-sha1 = "6ed52e9bfb2421f01ee62e1a5a30eba5f3f29c74" +uuid = "c22f9ab0-d5fe-5066-847c-f4bb1cd4e361" +version = "0.4.1+0" + +[[Xorg_xkbcomp_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_libxkbfile_jll"] +git-tree-sha1 = "4bcbf660f6c2e714f87e960a171b119d06ee163b" +uuid = "35661453-b289-5fab-8a00-3d9160c6a3a4" +version = "1.4.2+4" + +[[Xorg_xkeyboard_config_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Xorg_xkbcomp_jll"] +git-tree-sha1 = "5c8424f8a67c3f2209646d4425f3d415fee5931d" +uuid = "33bec58e-1273-512f-9401-5d533626f822" +version = "2.27.0+4" + +[[Xorg_xtrans_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "79c31e7844f6ecf779705fbc12146eb190b7d845" +uuid = "c5fb5394-a638-5e4d-96e5-b29de1b5cf10" +version = "1.4.0+3" + +[[ZMQ]] +deps = ["FileWatching", "Sockets", "ZeroMQ_jll"] +git-tree-sha1 = "fc68e8a3719166950a0f3e390a14c7302c48f8de" +uuid = "c2297ded-f4af-51ae-bb23-16f91089e4e1" +version = "1.2.1" + +[[ZeroMQ_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "bba617292e040408cb72baa03c20f43583bf239f" +uuid = "8f1865be-045e-5c20-9c9f-bfbfb0764568" +version = "4.3.2+5" + +[[ZipFile]] +deps = ["Libdl", "Printf", "Zlib_jll"] +git-tree-sha1 = "c3a5637e27e914a7a445b8d0ad063d701931e9f7" +uuid = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" +version = "0.9.3" + +[[Zlib_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "320228915c8debb12cb434c59057290f0834dbf6" +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.11+18" + +[[Zstd_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "6f1abcb0c44f184690912aa4b0ba861dd64f11b9" +uuid = 
"3161d3a3-bdf6-5164-811a-617609db77b4" +version = "1.4.5+2" + +[[libass_jll]] +deps = ["Artifacts", "Bzip2_jll", "FreeType2_jll", "FriBidi_jll", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "acc685bcf777b2202a904cdcb49ad34c2fa1880c" +uuid = "0ac62f75-1d6f-5e53-bd7c-93b484bb37c0" +version = "0.14.0+4" + +[[libfdk_aac_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "7a5780a0d9c6864184b3a2eeeb833a0c871f00ab" +uuid = "f638f0a6-7fb0-5443-88ba-1cc74229b280" +version = "0.1.6+4" + +[[libpng_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Zlib_jll"] +git-tree-sha1 = "6abbc424248097d69c0c87ba50fcb0753f93e0ee" +uuid = "b53b4c65-9356-5827-b1ea-8c7a1a84506f" +version = "1.6.37+6" + +[[libvorbis_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Ogg_jll", "Pkg"] +git-tree-sha1 = "fa14ac25af7a4b8a7f61b287a124df7aab601bcd" +uuid = "f27f6e37-5d2b-51aa-960f-b287f2bc3b7a" +version = "1.3.6+6" + +[[x264_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "d713c1ce4deac133e3334ee12f4adff07f81778f" +uuid = "1270edf5-f2f9-52d2-97e9-ab00b5d0237a" +version = "2020.7.14+2" + +[[x265_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] +git-tree-sha1 = "487da2f8f2f0c8ee0e83f39d13037d6bbf0a45ab" +uuid = "dfaa095f-4041-5dcd-9319-2fabd8486b76" +version = "3.0.0+3" + +[[xkbcommon_jll]] +deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "Wayland_jll", "Wayland_protocols_jll", "Xorg_libxcb_jll", "Xorg_xkeyboard_config_jll"] +git-tree-sha1 = "ece2350174195bb31de1a63bea3a41ae1aa593b6" +uuid = "d8fb68d0-12a3-5cfd-a85a-d49703b185fd" +version = "0.9.1+5" diff --git a/literate_notebooks/src-ES/Project.toml b/literate_notebooks/src-ES/Project.toml new file mode 100644 index 0000000..cfdb47d --- /dev/null +++ b/literate_notebooks/src-ES/Project.toml @@ -0,0 +1,22 @@ +[deps] +Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CSV = 
"336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" +CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +DataFramesMeta = "1313f7d8-7da2-5740-9ea0-a2ca25f37964" +FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" +FreqTables = "da1fdf0e-e0ff-5433-a45f-9bb5ff651cb1" +IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a" +JDF = "babc3d20-cd49-4f60-a736-a8f9c08892d3" +JLSO = "9da8a3cd-07a3-59c0-a743-3fdc52c30d11" +JSONTables = "b9914132-a727-11e9-1322-f18e41205b0b" +NamedArrays = "86f7a689-2022-50b4-a561-43c23ac3c673" +Pipe = "b98c9c47-44ae-5843-9183-064241ee97a0" +PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" +ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" From 9edb30d479b7bcfea0f1a040685fa0cd736df95c Mon Sep 17 00:00:00 2001 From: miguel raz Date: Tue, 20 Apr 2021 15:05:50 -0500 Subject: [PATCH 24/24] update stale 01_constructors.jl --- literate_notebooks/src/01_constructors.jl | 162 +++++++++++++++++----- 1 file changed, 126 insertions(+), 36 deletions(-) diff --git a/literate_notebooks/src/01_constructors.jl b/literate_notebooks/src/01_constructors.jl index 333a81e..5f2a7c5 100644 --- a/literate_notebooks/src/01_constructors.jl +++ b/literate_notebooks/src/01_constructors.jl @@ -19,74 +19,112 @@ DataFrame() # empty DataFrame # Or we could call the constructor using keyword arguments to add columns to the `DataFrame`. -DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3])) +DataFrame(A=1:3, B=rand(3), C=randstring.([3,3,3]), fixed=1) # We can create a `DataFrame` from a dictionary, in which case keys from the dictionary will be sorted to create the `DataFrame` columns. 
-x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b']) +x = Dict("A" => [1,2], "B" => [true, false], "C" => ['a', 'b'], "fixed" => Ref([1,1])) DataFrame(x) +# This time we used `Ref` to protect a vector from being treated as a column and to force broadcasting it into every row of the `:fixed` column (note that the [1,1] vector is aliased in each row). # Rather than explicitly creating a dictionary first, as above, we could pass `DataFrame` arguments with the syntax of dictionary key-value pairs. -# -# Note that in this case, we use symbols to denote the column names and arguments are not sorted. For example, `:A`, the symbol, produces `A`, the name of the first column here: +# Note that in this case, we use `Symbols` to denote the column names and arguments are not sorted. For example, `:A`, the symbol, produces `A`, the name of the first column here: DataFrame(:A => [1,2], :B => [true, false], :C => ['a', 'b']) +# Although, in general, using `Symbols` rather than strings to denote column names is preferred (as it is faster), DataFrames.jl accepts passing strings as column names, so this also works: + +DataFrame(:A => [1,2], :B => [true, false], "C" => ['a', 'b']) + +# You can also pass a vector of pairs, which is useful if it is constructed programmatically: +DataFrame(:A => [1,2], :B => [true, false], "C" => ['a', 'b'], :fixed => "const") + # Here we create a `DataFrame` from a vector of vectors, and each vector becomes a column. -DataFrame([rand(3) for i in 1:3]) +DataFrame([rand(3) for i in 1:3], :auto) + -# For now we can construct a single `DataFrame` from a `Vector` of atoms, creating a `DataFrame` with a single row. In future releases of DataFrames.jl, this will throw an error.
+DataFrame([rand(3) for i in 1:3], [:x1, :x2, :x3]) -DataFrame(rand(3)) +DataFrame([rand(3) for i in 1:3], ["x1", "x2", "x3"]) -# Instead use a transposed vector if you have a vector of atoms (in this way you effectively pass a two dimensional array to the constructor which is supported). -DataFrame(transpose([1, 2, 3])) -# Pass a second argument to give the columns names. +# As you can see you either pass a vector of column names as a second argument or `:auto` in which case column names are generated automatically. +# +# In particular it is not allowed to pass a vector of scalars to the `DataFrame` constructor. +DataFrame([1, 2, 3]) + +# +# Instead use a transposed vector if you have a vector of single values (in this way you effectively pass a two dimensional array to the constructor which is supported the same way as in the vector of vectors case). +DataFrame(permutedims([1, 2, 3]), :auto) + +# You can also pass a vector of `NamedTuple`s to construct a `DataFrame`: + +v = [(a=1, b=2), (a=3, b=4)] +DataFrame(v) -DataFrame([1:3, 4:6, 7:9], [:A, :B, :C]) +# Alternatively you can pass a `NamedTuple` of vectors: +n = (a=1:3, b=11:13) +DataFrame(n) # Here we create a `DataFrame` from a matrix, -DataFrame(rand(3,4)) +DataFrame(rand(3,4), :auto) # and here we do the same but also pass column names. DataFrame(rand(3,4), Symbol.('a':'d')) -# We can also construct an uninitialized DataFrame. -# -# Here we pass column types, names and number of rows; we get `missing` in column :C because `Any >: Missing`. +# or +DataFrame(rand(3,4), string.('a':'d')) -DataFrame([Int, Float64, Any], [:A, :B, :C], 1) +# This is how you can create a dataframe with no rows, but with predefined columns and their types +DataFrame(A=Int[], B=Float64[], C=String[]) -# Here we create a `DataFrame`, but column `:C` is #undef and Jupyter has problem with displaying it. (This works OK at the REPL.) +# Finally, we can create a `DataFrame` by copying an existing `DataFrame`.
# -# This will be fixed in next release of DataFrames! +# Note that `copy` also copies the vectors. +x = DataFrame(a=1:2, b='a':'b') +y = copy(x) +(x === y), isequal(x, y), (x.a == y.a), (x.a === y.a) -DataFrame([Int, Float64, String], [:A, :B, :C], 1) +# Calling DataFrame on a DataFrame object works like copy. +x = DataFrame(a=1:2, b='a':'b') +y = DataFrame(x) +(x === y), isequal(x, y), (x.a == y.a), (x.a === y.a) -# To initialize a `DataFrame` with column names, but no rows use +# You can avoid copying the columns of a data frame (if it is possible) by passing the `copycols=false` keyword argument: +x = DataFrame(a=1:2, b='a':'b') +y = DataFrame(x, copycols=false) +(x === y), isequal(x, y), (x.a == y.a), (x.a === y.a) -DataFrame([Int, Float64, String], [:A, :B, :C], 0) +# The same rule applies to the other constructors +a = [1, 2, 3] +df1 = DataFrame(a=a) +df2 = DataFrame(a=a, copycols=false) +df1.a === a, df2.a === a -# This syntax gives us a quick way to create homogenous `DataFrame`. +# You can create a similar uninitialized DataFrame based on an original one: +x = DataFrame(a=1, b=1.0) -DataFrame(Int, 3, 5) +similar(x) -# This example is similar, but has nonhomogenous columns. +# number of rows in a new DataFrame can be passed as a second argument +similar(x, 0) -DataFrame([Int, Float64], 4) +similar(x, 2) -# Finally, we can create a `DataFrame` by copying an existing `DataFrame`. -# -# Note that `copy` creates a shallow copy.
+# You can also create a new `DataFrame` from `SubDataFrame` or `DataFrameRow` (discussed in detail later in the tutorial; in particular although DataFrameRow is considered a 1-dimensional object similar to a `NamedTuple` it gets converted to a 1-row `DataFrame` for convenience) +df = view(x, [1,1], :) -y = DataFrame(x) -z = copy(x) -(x === y), (x === z), isequal(x, z) +typeof(sdf) + +DataFrame(sdf) + +dfr = x[1, :] + +DataFrame(dfr) # ### Conversion to a matrix # @@ -94,10 +132,12 @@ z = copy(x) x = DataFrame(x=1:2, y=["A", "B"]) -# We can create a matrix by passing this `DataFrame` to `Matrix`. +# We can create a matrix by passing this `DataFrame` to `Matrix` or `Array` Matrix(x) +Array(x) + # This would work even if the `DataFrame` had some `missing`s: x = DataFrame(x=1:2, y=[missing,"B"]) @@ -126,18 +166,68 @@ Matrix(x) Matrix{Int}(x) +# ### Conversion to NamedTuple related tabular structures +# First define some data frame +x = DataFrame(x = 1:2, y = ["A", "B"]) + +# Now we convert a `DataFrame` into a `NamedTuple` +ct = Tables.columntable(x) + +# Next we convert it into a vector of `NamedTuples` +rt = Tables.rowtable(x) + +# We can perform the conversions back to a DataFrame using a standard constructor call: +DataFrame(ct) + +DataFrame(rt) + +# ### Iterating data frame by rows or columns +# Sometimes it is useful to create a wrapper around a `DataFrame` that produces its rows or columns. +# For iterating columns you can use the `eachcol` function +ec = eachcol(x) + +# `DataFrameColumns` object behaves as a vector (note though it is not `AbstractVector`) +ec isa AbstractVector + +ec[1] + +# but you can also index into it using column names: +ec["x"] + +# similarly `eachrow` creates a `DataFrameRows` object that is a vector of its rows +er = eachrow(x) + +# DataFrameRows is an `AbstractVector` +er isa AbstractVector + +er[end] + +# Note that both data frame and also `DataFrameColumns` and `DataFrameRows` objects are not type stable (they do not know the types
of their columns). This is useful to avoid compilation cost if you have very wide data frames with heterogeneous column types. +# However, often (especially if a data frame is narrow) it is useful to create a lazy iterator that produces `NamedTuples` for each row of the `DataFrame`. Its key benefit is that it is type stable (so it is useful when you want to perform some operations in a fast way on a small subset of columns of a `DataFrame` - this strategy is often used internally by the DataFrames.jl package): +nti = Tables.namedtupleiterator(x) + +for row in enumerate(nti) + @show row +end + +# similarly to the previous options you can easily convert `NamedTupleIterator` back to a `DataFrame` +DataFrame(nti) + # ### Handling of duplicate column names # # We can pass the `makeunique` keyword argument to allow passing duplicate names (they get deduplicated) df = DataFrame(:a=>1, :a=>2, :a_1=>3; makeunique=true) -# Otherwise, duplicates will not be allowed in the future. +# Otherwise, duplicates are not allowed. df = DataFrame(:a=>1, :a=>2, :a_1=>3) -# A constructor that is passed column names as keyword arguments is a corner case. -# You cannot pass `makeunique` to allow duplicates here. +# Observe that currently `nothing` is not printed when displaying a DataFrame in Jupyter Notebook: +df = DataFrame(x=[1, nothing], y=[nothing, "a"], z=[missing, "c"]) + -df = DataFrame(a=1, a=2, makeunique=true) +# Finally you can use `empty` and `empty!` functions to remove all rows from a data frame: +empty(df) +df