Formula API (#7)

* Added JuliaFormatter to the right folder * Deleted the wrong file * Blue Style docs/make.jl * Added boilerplate and TODO stuff * added make_yX to formula.jl * fixed docstrings and docs/Project.toml * fixed docstrings * Update src/formula.jl Co-authored-by: Rik Huijzer <[email protected]> * removed Examples and better Tables.jl description * added tests for formula * formatted runtests and added approx to tests * formatted src/formula.jl * added an extra space in docstring * fix format * separated types from functions in StatsModels imports * added standardize_predictors * added atol=0.01 to tests * Added MixedModels and GLM as dependencies * Added DataAPI as a depedency * refactored using/imports to TuringGLM.jl * removed unused comments * fixed model_matrix and model_response * removed all dependencies and starting from scratch * format stuff * added all stuff from StatsModels formula and terms * format stuff * added formula tests * added intercepts to formula and DummyCoding * status * added terms and tests * tests for schema * tests for error_messages * moved tests for error to error_messages * Apply suggestions from code review Co-authored-by: Rik Huijzer <[email protected]> * format fix * fixed some tests and DummyCoding inner constructor Co-authored-by: Rik Huijzer <[email protected]>
TuringLang · Nov 23, 2021 · c31a978 · c31a978
1 parent 6e023e6
commit c31a978
Show file tree

Hide file tree

Showing 22 changed files with 1,951 additions and 108 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,8 +1,7 @@
 .jl.*.cov
 *.jl.cov
 *.jl.mem
-/Manifest.toml
-/test/Manifest.toml
+**/Manifest.toml
 /docs/build/
 
 .vscode/

diff --git a/Project.toml b/Project.toml
@@ -3,11 +3,21 @@ uuid = "0004c1f4-53c5-4d43-a221-a1dac6cf6b74"
 authors = ["Jose Storopoli <[email protected]> and contributors"]
 version = "0.1.0"
 
+[deps]
+DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
+TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
+Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
+Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
+
 [compat]
+DataAPI = "1.9"
+Reexport = "1.2"
+TableOperations = "1.2"
+Tables = "1.6"
+Turing = "0.19"
 julia = "1.6"
-
-[extras]
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
-
-[targets]
-test = ["Test"]
diff --git a/README.md b/README.md
@@ -12,5 +12,5 @@ TuringGLM makes easy to specify Bayesian **G**eneralized **L**inear **M**odels u
 
 Heavily inspired by [brms](https://github.com/paul-buerkner/brms/) (uses RStan or CmdStanR) and [bambi](https://github.com/bambinos/bambi) (uses PyMC3).
 
-The `@formula` macro is extended from [`StatsModels.jl`](https://github.com/JuliaStats/StatsModels.jl).
+The `@formula` macro is copied from [`StatsModels.jl`](https://github.com/JuliaStats/StatsModels.jl) along with [`MixedModels.jl`](https://github.com/JuliaStats/MixedModels.jl) for the random-effects (a.k.a. group-level predictors).
 
diff --git a/docs/Manifest.toml b/docs/Manifest.toml
diff --git a/docs/Project.toml b/docs/Project.toml
@@ -1,3 +1,6 @@
 [deps]
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 TuringGLM = "0004c1f4-53c5-4d43-a221-a1dac6cf6b74"
+
+[compat]
+Documenter = "0.27"
diff --git a/src/TuringGLM.jl b/src/TuringGLM.jl
@@ -1,5 +1,27 @@
 module TuringGLM
 
-# Write your package code here.
+using Statistics: mean, std, var
+using DataAPI: DataAPI
+using LinearAlgebra: I
+using REPL: levenshtein
+using StatsBase: StatsBase
+using Tables: Tables
+using TableOperations: TableOperations
 
+using Reexport: @reexport
+
+@reexport begin
+    using Turing
 end
+
+include("utils.jl")
+include("contrasts.jl")
+include("formula.jl")
+include("terms.jl")
+include("error_messages.jl")
+include("schema.jl")
+include("data_constructors.jl")
+
+export @formula
+
+end # module
diff --git a/src/contrasts.jl b/src/contrasts.jl
@@ -0,0 +1,218 @@
+abstract type AbstractContrasts end
+
+# Contrasts + Levels (usually from data) = ContrastsMatrix
+struct ContrastsMatrix{C<:AbstractContrasts,T,U}
+    matrix::Matrix{Float64}
+    termnames::Vector{U}
+    levels::Vector{T}
+    contrasts::C
+    invindex::Dict{T,Int}
+    function ContrastsMatrix(
+        matrix::AbstractMatrix, termnames::Vector{U}, levels::Vector{T}, contrasts::C
+    ) where {U,T,C<:AbstractContrasts}
+        allunique(levels) ||
+            throw(ArgumentError("levels must be all unique, got $(levels)"))
+        invindex = Dict{T,Int}(x => i for (i, x) in enumerate(levels))
+        return new{C,T,U}(matrix, termnames, levels, contrasts, invindex)
+    end
+end
+
+"""
+    Base.:(==)
+
+Only check equality of matrix, termnames, and levels, and that the type is the
+same for the contrasts (values are irrelevant).  This ensures that the two
+will behave identically in creating modelmatrix columns.
+"""
+function Base.:(==)(
+    a::ContrastsMatrix{C,T}, b::ContrastsMatrix{C,T}
+) where {C<:AbstractContrasts,T}
+    return a.matrix == b.matrix && a.termnames == b.termnames && a.levels == b.levels
+end
+
+function Base.hash(a::ContrastsMatrix{C}, h::UInt) where {C}
+    return hash(C, hash(a.matrix, hash(a.termnames, hash(a.levels, h))))
+end
+
+"""
+    ContrastsMatrix(contrasts::AbstractContrasts, levels::AbstractVector)
+    ContrastsMatrix(contrasts_matrix::ContrastsMatrix, levels::AbstractVector)
+
+An instantiation of a contrast coding system for particular levels
+
+This type is used internally for generating model matrices based on categorical
+data, and **most users will not need to deal with it directly**.  Conceptually,
+a `ContrastsMatrix` object stands for an instantiation of a contrast coding
+*system* for a particular set of categorical *data levels*.
+
+If levels are specified in the `AbstractContrasts`, those will be used, and likewise
+for the base level (which defaults to the first level).
+
+# Constructors
+
+```julia
+ContrastsMatrix(contrasts::AbstractContrasts, levels::AbstractVector)
+ContrastsMatrix(contrasts_matrix::ContrastsMatrix, levels::AbstractVector)
+```
+
+# Arguments
+
+* `contrasts::AbstractContrasts`: The contrast coding system to use.
+   We only use `DummyCoding()`
+
+* `levels::AbstractVector`: The levels to generate contrasts for.
+
+* `contrasts_matrix::ContrastsMatrix`: Constructing a `ContrastsMatrix` from
+  another will check that the levels match.  This is used, for example, in
+  constructing a model matrix from a `ModelFrame` using different data.
+"""
+function ContrastsMatrix(
+    contrasts::C, levels::AbstractVector{T}
+) where {C<:AbstractContrasts,T}
+
+    # if levels are defined on contrasts, use those, validating that they line up.
+    # what does that mean? either:
+    #
+    # 1. DataAPI.levels(contrasts) == levels (best case)
+    # 2. data levels missing from contrast: would generate empty/undefined rows.
+    #    better to filter data frame first
+    # 3. contrast levels missing from data: would have empty columns, generate a
+    #    rank-deficient model matrix.
+    c_levels = something(DataAPI.levels(contrasts), levels)
+    if eltype(c_levels) != eltype(levels)
+        throw(
+            ArgumentError(
+                "mismatching levels types: got $(eltype(levels)), expected " *
+                "$(eltype(c_levels)) based on contrasts levels.",
+            ),
+        )
+    end
+    mismatched_levels = symdiff(c_levels, levels)
+    if !isempty(mismatched_levels)
+        throw(
+            ArgumentError(
+                "contrasts levels not found in data or vice-versa: " *
+                "$mismatched_levels." *
+                "\n  Data levels: $levels." *
+                "\n  Contrast levels: $c_levels",
+            ),
+        )
+    end
+
+    n = length(c_levels)
+    if n == 0
+        msg = "empty set of levels found (need at least two to compute " * "contrasts)."
+        throw(ArgumentError(msg))
+    elseif n == 1
+        throw(
+            ArgumentError(
+                "only one level found: $(c_levels[1]) (need at least two to " *
+                "compute contrasts).",
+            ),
+        )
+    end
+
+    # find index of base level. use contrasts.base, then default (1).
+    base_level = baselevel(contrasts)
+    baseind = base_level === nothing ? 1 : findfirst(isequal(base_level), c_levels)
+    if baseind === nothing
+        throw(ArgumentError("base level $(base_level) not found in levels " * "$c_levels."))
+    end
+
+    tnames = termnames(contrasts, c_levels, baseind)
+
+    mat = contrasts_matrix(contrasts, baseind, n)
+
+    return ContrastsMatrix(mat, tnames, c_levels, contrasts)
+end
+
+function ContrastsMatrix(c::Type{<:AbstractContrasts}, levels::AbstractVector)
+    msg = "contrast types must be instantiated (use $c() instead of $c)"
+    return throw(ArgumentError(msg))
+end
+
+# given an existing ContrastsMatrix, check that all passed levels are present
+# in the contrasts. Note that this behavior is different from the
+# ContrastsMatrix constructor, which requires that the levels be exactly the same.
+function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector)
+    if !isempty(setdiff(levels, c.levels))
+        throw(
+            ArgumentError(
+                "there are levels in data that are not in ContrastsMatrix: " *
+                "$(setdiff(levels, c.levels))" *
+                "\n  Data levels: $(levels)" *
+                "\n  Contrast levels: $(c.levels)",
+            ),
+        )
+    end
+    return c
+end
+
+function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer)
+    not_base = [1:(baseind - 1); (baseind + 1):length(levels)]
+    return levels[not_base]
+end
+
+function Base.getindex(contrasts::ContrastsMatrix{C,T}, rowinds, colinds) where {C,T}
+    return getindex(contrasts.matrix, getindex.(Ref(contrasts.invindex), rowinds), colinds)
+end
+
+"""
+    DummyCoding([base[, levels]])
+    DummyCoding(; base=nothing, levels=nothing)
+
+Dummy coding generates one indicator column (1 or 0) for each non-base level.
+
+If `levels` are omitted or `nothing`, they are determined from the data
+by calling the `levels` function on the data when constructing `ContrastsMatrix`.
+If `base` is omitted or `nothing`, the first level is used as the base.
+
+Columns have non-zero mean and are collinear with an intercept column (and
+lower-order columns for interactions) but are orthogonal to each other. In a
+regression model, dummy coding leads to an intercept that is the mean of the
+dependent variable for base level.
+
+Also known as "treatment coding" or "one-hot encoding".
+"""
+mutable struct DummyCoding <: AbstractContrasts
+    base::Any
+    levels::Union{AbstractVector,Nothing}
+    # constructor with optional keyword arguments, defaulting to nothing
+    function DummyCoding(; base=nothing, levels::Union{AbstractVector,Nothing}=nothing)
+        return new(base, levels)
+    end
+end
+baselevel(c::DummyCoding) = c.base
+DataAPI.levels(c::DummyCoding) = c.levels
+
+function contrasts_matrix(C::DummyCoding, baseind, n)
+    return Matrix(1.0I, n, n)[:, [1:(baseind - 1); (baseind + 1):n]]
+end
+
+"""
+    FullDummyCoding()
+
+Full-rank dummy coding generates one indicator (1 or 0) column for each level,
+**including** the base level. This is sometimes known as
+[one-hot encoding](https://en.wikipedia.org/wiki/One-hot).
+
+Needed internally for some situations where a categorical variable with ``k``
+levels needs to be converted into ``k`` model matrix columns instead of the
+standard ``k-1``.
+"""
+struct FullDummyCoding <: AbstractContrasts
+    # Dummy contrasts have no base level (since all levels produce a column)
+end
+
+function ContrastsMatrix(C::FullDummyCoding, levels::AbstractVector{T}) where {T}
+    return ContrastsMatrix(Matrix(1.0I, length(levels), length(levels)), levels, levels, C)
+end
+
+"Promote contrasts matrix to full rank version"
+function Base.convert(::Type{ContrastsMatrix{FullDummyCoding}}, C::ContrastsMatrix)
+    return ContrastsMatrix(FullDummyCoding(), C.levels)
+end
+
+# fallback method for other types that might not have base or level fields
+baselevel(c::AbstractContrasts) = nothing
+DataAPI.levels(c::AbstractContrasts) = nothing
Original file line number	Diff line number	Diff line change
Expand Up		@@ -12,5 +12,5 @@ TuringGLM makes easy to specify Bayesian Generalized Linear Models u

		Heavily inspired by [brms](https://github.com/paul-buerkner/brms/) (uses RStan or CmdStanR) and [bambi](https://github.com/bambinos/bambi) (uses PyMC3).

		The `@formula` macro is extended from [`StatsModels.jl`](https://github.com/JuliaStats/StatsModels.jl).
		The `@formula` macro is copied from [`StatsModels.jl`](https://github.com/JuliaStats/StatsModels.jl) along with [`MixedModels.jl`](https://github.com/JuliaStats/MixedModels.jl) for the random-effects (a.k.a. group-level predictors).