-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Added JuliaFormatter to the right folder * Deleted the wrong file * Blue Style docs/make.jl * Added boilerplate and TODO stuff * added make_yX to formula.jl * fixed docstrings and docs/Project.toml * fixed docstrings * Update src/formula.jl Co-authored-by: Rik Huijzer <[email protected]> * removed Examples and better Tables.jl description * added tests for formula * formatted runtests and added approx to tests * formatted src/formula.jl * added an extra space in docstring * fix format * separated types from functions in StatsModels imports * added standardize_predictors * added atol=0.01 to tests * Added MixedModels and GLM as dependencies * Added DataAPI as a depedency * refactored using/imports to TuringGLM.jl * removed unused comments * fixed model_matrix and model_response * removed all dependencies and starting from scratch * format stuff * added all stuff from StatsModels formula and terms * format stuff * added formula tests * added intercepts to formula and DummyCoding * status * added terms and tests * tests for schema * tests for error_messages * moved tests for error to error_messages * Apply suggestions from code review Co-authored-by: Rik Huijzer <[email protected]> * format fix * fixed some tests and DummyCoding inner constructor Co-authored-by: Rik Huijzer <[email protected]>
- Loading branch information
1 parent
6e023e6
commit c31a978
Showing
22 changed files
with
1,951 additions
and
108 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,7 @@ | ||
.jl.*.cov | ||
*.jl.cov | ||
*.jl.mem | ||
/Manifest.toml | ||
/test/Manifest.toml | ||
**/Manifest.toml | ||
/docs/build/ | ||
|
||
.vscode/ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,11 +3,21 @@ uuid = "0004c1f4-53c5-4d43-a221-a1dac6cf6b74" | |
authors = ["Jose Storopoli <[email protected]> and contributors"] | ||
version = "0.1.0" | ||
|
||
[deps] | ||
DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" | ||
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" | ||
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" | ||
Reexport = "189a3867-3050-52da-a836-e630ba90ab69" | ||
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" | ||
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" | ||
TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87" | ||
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" | ||
Turing = "fce5fe82-541a-59a6-adf8-730c64b5f9a0" | ||
|
||
[compat] | ||
DataAPI = "1.9" | ||
Reexport = "1.2" | ||
TableOperations = "1.2" | ||
Tables = "1.6" | ||
Turing = "0.19" | ||
julia = "1.6" | ||
|
||
[extras] | ||
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" | ||
|
||
[targets] | ||
test = ["Test"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,6 @@ | ||
[deps] | ||
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" | ||
TuringGLM = "0004c1f4-53c5-4d43-a221-a1dac6cf6b74" | ||
|
||
[compat] | ||
Documenter = "0.27" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,27 @@ | ||
module TuringGLM | ||
|
||
# Write your package code here. | ||
using Statistics: mean, std, var | ||
using DataAPI: DataAPI | ||
using LinearAlgebra: I | ||
using REPL: levenshtein | ||
using StatsBase: StatsBase | ||
using Tables: Tables | ||
using TableOperations: TableOperations | ||
|
||
using Reexport: @reexport | ||
|
||
@reexport begin | ||
using Turing | ||
end | ||
|
||
include("utils.jl") | ||
include("contrasts.jl") | ||
include("formula.jl") | ||
include("terms.jl") | ||
include("error_messages.jl") | ||
include("schema.jl") | ||
include("data_constructors.jl") | ||
|
||
export @formula | ||
|
||
end # module |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,218 @@ | ||
abstract type AbstractContrasts end | ||
|
||
# Contrasts + Levels (usually from data) = ContrastsMatrix | ||
struct ContrastsMatrix{C<:AbstractContrasts,T,U} | ||
matrix::Matrix{Float64} | ||
termnames::Vector{U} | ||
levels::Vector{T} | ||
contrasts::C | ||
invindex::Dict{T,Int} | ||
function ContrastsMatrix( | ||
matrix::AbstractMatrix, termnames::Vector{U}, levels::Vector{T}, contrasts::C | ||
) where {U,T,C<:AbstractContrasts} | ||
allunique(levels) || | ||
throw(ArgumentError("levels must be all unique, got $(levels)")) | ||
invindex = Dict{T,Int}(x => i for (i, x) in enumerate(levels)) | ||
return new{C,T,U}(matrix, termnames, levels, contrasts, invindex) | ||
end | ||
end | ||
|
||
""" | ||
Base.:(==) | ||
Only check equality of matrix, termnames, and levels, and that the type is the | ||
same for the contrasts (values are irrelevant). This ensures that the two | ||
will behave identically in creating modelmatrix columns. | ||
""" | ||
function Base.:(==)( | ||
a::ContrastsMatrix{C,T}, b::ContrastsMatrix{C,T} | ||
) where {C<:AbstractContrasts,T} | ||
return a.matrix == b.matrix && a.termnames == b.termnames && a.levels == b.levels | ||
end | ||
|
||
function Base.hash(a::ContrastsMatrix{C}, h::UInt) where {C} | ||
return hash(C, hash(a.matrix, hash(a.termnames, hash(a.levels, h)))) | ||
end | ||
|
||
""" | ||
ContrastsMatrix(contrasts::AbstractContrasts, levels::AbstractVector) | ||
ContrastsMatrix(contrasts_matrix::ContrastsMatrix, levels::AbstractVector) | ||
An instantiation of a contrast coding system for particular levels | ||
This type is used internally for generating model matrices based on categorical | ||
data, and **most users will not need to deal with it directly**. Conceptually, | ||
a `ContrastsMatrix` object stands for an instantiation of a contrast coding | ||
*system* for a particular set of categorical *data levels*. | ||
If levels are specified in the `AbstractContrasts`, those will be used, and likewise | ||
for the base level (which defaults to the first level). | ||
# Constructors | ||
```julia | ||
ContrastsMatrix(contrasts::AbstractContrasts, levels::AbstractVector) | ||
ContrastsMatrix(contrasts_matrix::ContrastsMatrix, levels::AbstractVector) | ||
``` | ||
# Arguments | ||
* `contrasts::AbstractContrasts`: The contrast coding system to use. | ||
We only use `DummyCoding()` | ||
* `levels::AbstractVector`: The levels to generate contrasts for. | ||
* `contrasts_matrix::ContrastsMatrix`: Constructing a `ContrastsMatrix` from | ||
another will check that the levels match. This is used, for example, in | ||
constructing a model matrix from a `ModelFrame` using different data. | ||
""" | ||
function ContrastsMatrix( | ||
contrasts::C, levels::AbstractVector{T} | ||
) where {C<:AbstractContrasts,T} | ||
|
||
# if levels are defined on contrasts, use those, validating that they line up. | ||
# what does that mean? either: | ||
# | ||
# 1. DataAPI.levels(contrasts) == levels (best case) | ||
# 2. data levels missing from contrast: would generate empty/undefined rows. | ||
# better to filter data frame first | ||
# 3. contrast levels missing from data: would have empty columns, generate a | ||
# rank-deficient model matrix. | ||
c_levels = something(DataAPI.levels(contrasts), levels) | ||
if eltype(c_levels) != eltype(levels) | ||
throw( | ||
ArgumentError( | ||
"mismatching levels types: got $(eltype(levels)), expected " * | ||
"$(eltype(c_levels)) based on contrasts levels.", | ||
), | ||
) | ||
end | ||
mismatched_levels = symdiff(c_levels, levels) | ||
if !isempty(mismatched_levels) | ||
throw( | ||
ArgumentError( | ||
"contrasts levels not found in data or vice-versa: " * | ||
"$mismatched_levels." * | ||
"\n Data levels: $levels." * | ||
"\n Contrast levels: $c_levels", | ||
), | ||
) | ||
end | ||
|
||
n = length(c_levels) | ||
if n == 0 | ||
msg = "empty set of levels found (need at least two to compute " * "contrasts)." | ||
throw(ArgumentError(msg)) | ||
elseif n == 1 | ||
throw( | ||
ArgumentError( | ||
"only one level found: $(c_levels[1]) (need at least two to " * | ||
"compute contrasts).", | ||
), | ||
) | ||
end | ||
|
||
# find index of base level. use contrasts.base, then default (1). | ||
base_level = baselevel(contrasts) | ||
baseind = base_level === nothing ? 1 : findfirst(isequal(base_level), c_levels) | ||
if baseind === nothing | ||
throw(ArgumentError("base level $(base_level) not found in levels " * "$c_levels.")) | ||
end | ||
|
||
tnames = termnames(contrasts, c_levels, baseind) | ||
|
||
mat = contrasts_matrix(contrasts, baseind, n) | ||
|
||
return ContrastsMatrix(mat, tnames, c_levels, contrasts) | ||
end | ||
|
||
function ContrastsMatrix(c::Type{<:AbstractContrasts}, levels::AbstractVector) | ||
msg = "contrast types must be instantiated (use $c() instead of $c)" | ||
return throw(ArgumentError(msg)) | ||
end | ||
|
||
# given an existing ContrastsMatrix, check that all passed levels are present | ||
# in the contrasts. Note that this behavior is different from the | ||
# ContrastsMatrix constructor, which requires that the levels be exactly the same. | ||
function ContrastsMatrix(c::ContrastsMatrix, levels::AbstractVector) | ||
if !isempty(setdiff(levels, c.levels)) | ||
throw( | ||
ArgumentError( | ||
"there are levels in data that are not in ContrastsMatrix: " * | ||
"$(setdiff(levels, c.levels))" * | ||
"\n Data levels: $(levels)" * | ||
"\n Contrast levels: $(c.levels)", | ||
), | ||
) | ||
end | ||
return c | ||
end | ||
|
||
function termnames(C::AbstractContrasts, levels::AbstractVector, baseind::Integer) | ||
not_base = [1:(baseind - 1); (baseind + 1):length(levels)] | ||
return levels[not_base] | ||
end | ||
|
||
function Base.getindex(contrasts::ContrastsMatrix{C,T}, rowinds, colinds) where {C,T} | ||
return getindex(contrasts.matrix, getindex.(Ref(contrasts.invindex), rowinds), colinds) | ||
end | ||
|
||
""" | ||
DummyCoding([base[, levels]]) | ||
DummyCoding(; base=nothing, levels=nothing) | ||
Dummy coding generates one indicator column (1 or 0) for each non-base level. | ||
If `levels` are omitted or `nothing`, they are determined from the data | ||
by calling the `levels` function on the data when constructing `ContrastsMatrix`. | ||
If `base` is omitted or `nothing`, the first level is used as the base. | ||
Columns have non-zero mean and are collinear with an intercept column (and | ||
lower-order columns for interactions) but are orthogonal to each other. In a | ||
regression model, dummy coding leads to an intercept that is the mean of the | ||
dependent variable for base level. | ||
Also known as "treatment coding" or "one-hot encoding". | ||
""" | ||
mutable struct DummyCoding <: AbstractContrasts | ||
base::Any | ||
levels::Union{AbstractVector,Nothing} | ||
# constructor with optional keyword arguments, defaulting to nothing | ||
function DummyCoding(; base=nothing, levels::Union{AbstractVector,Nothing}=nothing) | ||
return new(base, levels) | ||
end | ||
end | ||
baselevel(c::DummyCoding) = c.base | ||
DataAPI.levels(c::DummyCoding) = c.levels | ||
|
||
function contrasts_matrix(C::DummyCoding, baseind, n) | ||
return Matrix(1.0I, n, n)[:, [1:(baseind - 1); (baseind + 1):n]] | ||
end | ||
|
||
""" | ||
FullDummyCoding() | ||
Full-rank dummy coding generates one indicator (1 or 0) column for each level, | ||
**including** the base level. This is sometimes known as | ||
[one-hot encoding](https://en.wikipedia.org/wiki/One-hot). | ||
Needed internally for some situations where a categorical variable with ``k`` | ||
levels needs to be converted into ``k`` model matrix columns instead of the | ||
standard ``k-1``. | ||
""" | ||
struct FullDummyCoding <: AbstractContrasts | ||
# Dummy contrasts have no base level (since all levels produce a column) | ||
end | ||
|
||
function ContrastsMatrix(C::FullDummyCoding, levels::AbstractVector{T}) where {T} | ||
return ContrastsMatrix(Matrix(1.0I, length(levels), length(levels)), levels, levels, C) | ||
end | ||
|
||
"Promote contrasts matrix to full rank version" | ||
function Base.convert(::Type{ContrastsMatrix{FullDummyCoding}}, C::ContrastsMatrix) | ||
return ContrastsMatrix(FullDummyCoding(), C.levels) | ||
end | ||
|
||
# fallback method for other types that might not have base or level fields | ||
baselevel(c::AbstractContrasts) = nothing | ||
DataAPI.levels(c::AbstractContrasts) = nothing |
Oops, something went wrong.