Skip to content

Commit

Permalink
Formula API (#7)
Browse files Browse the repository at this point in the history
* Added JuliaFormatter to the right folder

* Deleted the wrong file

* Blue Style docs/make.jl

* Added boilerplate and TODO stuff

* added make_yX to formula.jl

* fixed docstrings and docs/Project.toml

* fixed docstrings

* Update src/formula.jl

Co-authored-by: Rik Huijzer <[email protected]>

* removed Examples and better Tables.jl description

* added tests for formula

* formatted runtests and added approx to tests

* formatted src/formula.jl

* added an extra space in docstring

* fix format

* separated types from functions in StatsModels imports

* added standardize_predictors

* added atol=0.01 to tests

* Added MixedModels and GLM as dependencies

* Added DataAPI as a depedency

* refactored using/imports to TuringGLM.jl

* removed unused comments

* fixed model_matrix and model_response

* removed all dependencies and starting from scratch

* format stuff

* added all stuff from StatsModels formula and terms

* format stuff

* added formula tests

* added intercepts to formula and DummyCoding

* status

* added terms and tests

* tests for schema

* tests for error_messages

* moved tests for error to error_messages

* Apply suggestions from code review

Co-authored-by: Rik Huijzer <[email protected]>

* format fix

* fixed some tests and DummyCoding inner constructor

Co-authored-by: Rik Huijzer <[email protected]>
  • Loading branch information
storopoli and rikhuijzer authored Nov 23, 2021
1 parent 6e023e6 commit c31a978
Show file tree
Hide file tree
Showing 22 changed files with 1,951 additions and 108 deletions.
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
.jl.*.cov
*.jl.cov
*.jl.mem
/Manifest.toml
/test/Manifest.toml
**/Manifest.toml
/docs/build/

.vscode/
Expand Down
22 changes: 16 additions & 6 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,21 @@ uuid = "0004c1f4-53c5-4d43-a221-a1dac6cf6b74"
authors = ["Jose Storopoli <[email protected]> and contributors"]
version = "0.1.0"

[deps]
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"

[compat]
DataAPI = "1.9"
Reexport = "1.2"
TableOperations = "1.2"
Tables = "1.6"
Turing = "0.19"
julia = "1.6"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ TuringGLM makes easy to specify Bayesian **G**eneralized **L**inear **M**odels u

Heavily inspired by [brms](https://github.com/paul-buerkner/brms/) (uses RStan or CmdStanR) and [bambi](https://github.com/bambinos/bambi) (uses PyMC3).

The `@formula` macro is extended from [`StatsModels.jl`](https://github.com/JuliaStats/StatsModels.jl).
The `@formula` macro is copied from [`StatsModels.jl`](https://github.com/JuliaStats/StatsModels.jl) along with [`MixedModels.jl`](https://github.com/JuliaStats/MixedModels.jl) for the random-effects (a.k.a. group-level predictors).

97 changes: 0 additions & 97 deletions docs/Manifest.toml

This file was deleted.

3 changes: 3 additions & 0 deletions docs/Project.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[deps]
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
TuringGLM = "0004c1f4-53c5-4d43-a221-a1dac6cf6b74"

[compat]
Documenter = "0.27"
24 changes: 23 additions & 1 deletion src/TuringGLM.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,27 @@
module TuringGLM

# Write your package code here.
using Statistics: mean, std, var
using DataAPI: DataAPI
using LinearAlgebra: I
using REPL: levenshtein
using StatsBase: StatsBase
using Tables: Tables
using TableOperations: TableOperations

using Reexport: @reexport

@reexport begin
using Turing
end

include("utils.jl")
include("contrasts.jl")
include("formula.jl")
include("terms.jl")
include("error_messages.jl")
include("schema.jl")
include("data_constructors.jl")

export @formula

end # module
218 changes: 218 additions & 0 deletions src/contrasts.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
abstract type AbstractContrasts end

# Contrasts + Levels (usually from data) = ContrastsMatrix
struct ContrastsMatrix{C<:AbstractContrasts,T,U}
matrix::Matrix{Float64}
termnames::Vector{U}
levels::Vector{T}
contrasts::C
invindex::Dict{T,Int}
function ContrastsMatrix(
matrix::AbstractMatrix, termnames::Vector{U}, levels::Vector{T}, contrasts::C
) where {U,T,C<:AbstractContrasts}
allunique(levels) ||
throw(ArgumentError("levels must be all unique, got $(levels)"))
invindex = Dict{T,Int}(x => i for (i, x) in enumerate(levels))
return new{C,T,U}(matrix, termnames, levels, contrasts, invindex)
end
end

"""
Base.:(==)
Only check equality of matrix, termnames, and levels, and that the type is the
same for the contrasts (values are irrelevant). This ensures that the two
will behave identically in creating modelmatrix columns.
"""
function Base.:(==)(
a::ContrastsMatrix{C,T}, b::ContrastsMatrix{C,T}
) where {C<:AbstractContrasts,T}
return a.matrix == b.matrix && a.termnames == b.termnames && a.levels == b.levels
end

function Base.hash(a::ContrastsMatrix{C}, h::UInt) where {C}
return hash(C, hash(a.matrix, hash(a.termnames, hash(a.levels, h))))
end

"""
ContrastsMatrix(contrasts::AbstractContrasts, levels::AbstractVector)
ContrastsMatrix(contrasts_matrix::ContrastsMatrix, levels::AbstractVector)
An instantiation of a contrast coding system for particular levels
This type is used internally for generating model matrices based on categorical
data, and **most users will not need to deal with it directly**. Conceptually,
a `ContrastsMatrix` object stands for an instantiation of a contrast coding
*system* for a particular set of categorical *data levels*.
If levels are specified in the `AbstractContrasts`, those will be used, and likewise
for the base level (which defaults to the first level).
# Constructors
```julia
ContrastsMatrix(contrasts::AbstractContrasts, levels::AbstractVector)
ContrastsMatrix(contrasts_matrix::ContrastsMatrix, levels::AbstractVector)
```
# Arguments
* `contrasts::AbstractContrasts`: The contrast coding system to use.
We only use `DummyCoding()`
* `levels::AbstractVector`: The levels to generate contrasts for.
* `contrasts_matrix::ContrastsMatrix`: Constructing a `ContrastsMatrix` from
another will check that the levels match. This is used, for example, in
constructing a model matrix from a `ModelFrame` using different data.
"""
function ContrastsMatrix(
contrasts::C, levels::AbstractVector{T}
) where {C<:AbstractContrasts,T}

# if levels are defined on contrasts, use those, validating that they line up.
# what does that mean? either:
#
# 1. DataAPI.levels(contrasts) == levels (best case)
# 2. data levels missing from contrast: would generate empty/undefined rows.
# better to filter data frame first
# 3. contrast levels missing from data: would have empty columns, generate a
# rank-deficient model matrix.
c_levels = something(DataAPI.levels(contrasts), levels)
if eltype(c_levels) != eltype(levels)
throw(
ArgumentError(
"mismatching levels types: got $(eltype(levels)), expected " *
"$(eltype(c_levels)) based on contrasts levels.",
),
)
end
mismatched_levels = symdiff(c_levels, levels)
if !isempty(mismatched_levels)
throw(
ArgumentError(
"contrasts levels not found in data or vice-versa: " *
"$mismatched_levels." *
"\n Data levels: $levels." *
"\n Contrast levels: $c_levels",
),
)
end

n = length(c_levels)
if n == 0
msg = "empty set of levels found (need at least two to compute " * "contrasts)."
throw(ArgumentError(msg))
elseif n == 1
throw(
ArgumentError(
"only one level found: $(c_levels[1]) (need at least two to " *
"compute contrasts).",
),
)
end

# find index of base level. use contrasts.base, then default (1).
base_level = baselevel(contrasts)
baseind = base_level === nothing ? 1 : findfirst(isequal(base_level), c_levels)
if baseind === nothing
throw(ArgumentError("base level $(base_level) not found in levels " * "$c_levels."))
end

tnames = termnames(contrasts, c_levels, baseind)

mat = contrasts_matrix(contrasts, baseind, n)

return ContrastsMatrix(mat, tnames, c_levels, contrasts)
end

function ContrastsMatrix(c::Type{<:AbstractContrasts}, levels::AbstractVector)
msg = "contrast types must be instantiated (use $c() instead of $c)"
return throw(ArgumentError(msg))
end

# given an existing ContrastsMatrix, check that all passed levels are present
# in the contrasts. Note that this behavior is different from the
# ContrastsMatrix constructor, which requires that the levels be exactly the same.
function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector)
if !isempty(setdiff(levels, c.levels))
throw(
ArgumentError(
"there are levels in data that are not in ContrastsMatrix: " *
"$(setdiff(levels, c.levels))" *
"\n Data levels: $(levels)" *
"\n Contrast levels: $(c.levels)",
),
)
end
return c
end

function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer)
not_base = [1:(baseind - 1); (baseind + 1):length(levels)]
return levels[not_base]
end

function Base.getindex(contrasts::ContrastsMatrix{C,T}, rowinds, colinds) where {C,T}
return getindex(contrasts.matrix, getindex.(Ref(contrasts.invindex), rowinds), colinds)
end

"""
DummyCoding([base[, levels]])
DummyCoding(; base=nothing, levels=nothing)
Dummy coding generates one indicator column (1 or 0) for each non-base level.
If `levels` are omitted or `nothing`, they are determined from the data
by calling the `levels` function on the data when constructing `ContrastsMatrix`.
If `base` is omitted or `nothing`, the first level is used as the base.
Columns have non-zero mean and are collinear with an intercept column (and
lower-order columns for interactions) but are orthogonal to each other. In a
regression model, dummy coding leads to an intercept that is the mean of the
dependent variable for base level.
Also known as "treatment coding" or "one-hot encoding".
"""
mutable struct DummyCoding <: AbstractContrasts
base::Any
levels::Union{AbstractVector,Nothing}
# constructor with optional keyword arguments, defaulting to nothing
function DummyCoding(; base=nothing, levels::Union{AbstractVector,Nothing}=nothing)
return new(base, levels)
end
end
baselevel(c::DummyCoding) = c.base
DataAPI.levels(c::DummyCoding) = c.levels

function contrasts_matrix(C::DummyCoding, baseind, n)
return Matrix(1.0I, n, n)[:, [1:(baseind - 1); (baseind + 1):n]]
end

"""
FullDummyCoding()
Full-rank dummy coding generates one indicator (1 or 0) column for each level,
**including** the base level. This is sometimes known as
[one-hot encoding](https://en.wikipedia.org/wiki/One-hot).
Needed internally for some situations where a categorical variable with ``k``
levels needs to be converted into ``k`` model matrix columns instead of the
standard ``k-1``.
"""
struct FullDummyCoding <: AbstractContrasts
# Dummy contrasts have no base level (since all levels produce a column)
end

function ContrastsMatrix(C::FullDummyCoding, levels::AbstractVector{T}) where {T}
return ContrastsMatrix(Matrix(1.0I, length(levels), length(levels)), levels, levels, C)
end

"Promote contrasts matrix to full rank version"
function Base.convert(::Type{ContrastsMatrix{FullDummyCoding}}, C::ContrastsMatrix)
return ContrastsMatrix(FullDummyCoding(), C.levels)
end

# fallback method for other types that might not have base or level fields
baselevel(c::AbstractContrasts) = nothing
DataAPI.levels(c::AbstractContrasts) = nothing
Loading

0 comments on commit c31a978

Please sign in to comment.