Replace infix ~ for formulas with a model macro (#9)

ararslan · web-flow · commit 1e86a5b2b21b · 2016-12-31T11:20:38.000-08:00
diff --git a/docs/src/contrasts.md b/docs/src/contrasts.md
@@ -22,7 +22,7 @@ The default contrast coding system is `DummyCoding`.  To override this, use
 the `contrasts` argument when constructing a `ModelFrame`:
 
 ```julia
-mf = ModelFrame(y ~ 1 + x, df, contrasts = Dict(:x => EffectsCoding()))
+mf = ModelFrame(@formula(y ~ 1 + x), df, contrasts = Dict(:x => EffectsCoding()))
 ```
 
 To change the contrast coding for one or more variables in place, use
diff --git a/docs/src/formula.md b/docs/src/formula.md
@@ -21,13 +21,16 @@ goal is to support any tabular data format that adheres to a minimal API,
 ## The `Formula` type
 
 The basic conceptual tool for this is the `Formula`, which has a left side and a
-right side, separated by `~`:
+right side, separated by `~`. Formulas are constructed using the `@formula` macro:
 
 ```jldoctest
-julia> y ~ 1 + a
+julia> @formula(y ~ 1 + a)
 Formula: y ~ 1 + a
 ```
 
+Note that the `@formula` macro **must** be called with parentheses to ensure that
+the formula is parsed properly.
+
 The left side of a formula conventionally represents *dependent* variables, and
 the right side *independent* variables (or regressors).  *Terms* are separated
 by `+`.  Basic terms are the integers `1` or `0`—evaluated as the presence or
@@ -43,7 +46,7 @@ It's often convenient to include main effects and interactions for a number of
 variables.  The `*` operator does this, expanding in the following way:
 
 ```jldoctest
-julia> Formula(StatsModels.Terms(y ~ 1 + a*b))
+julia> Formula(StatsModels.Terms(@formula(y ~ 1 + a*b)))
 Formula: y ~ 1 + a + b + a & b
 ```
 
@@ -54,21 +57,28 @@ This applies to higher-order interactions, too: `a*b*c` expands to the main
 effects, all two-way interactions, and the three way interaction `a&b&c`:
 
 ```jldoctest
-julia> Formula(StatsModels.Terms(y ~ 1 + a*b*c))
+julia> Formula(StatsModels.Terms(@formula(y ~ 1 + a*b*c)))
 Formula: y ~ 1 + a + b + c + a & b + a & c + b & c + &(a,b,c)
 ```
 
 Both the `*` and the `&` operators act like multiplication, and are distributive
 over addition:
 
 ```jldoctest
-julia> Formula(StatsModels.Terms(y ~ 1 + (a+b) & c))
+julia> Formula(StatsModels.Terms(@formula(y ~ 1 + (a+b) & c)))
 Formula: y ~ 1 + c & a + c & b
 
-julia> Formula(StatsModels.Terms(y ~ 1 + (a+b) * c))
+julia> Formula(StatsModels.Terms(@formula(y ~ 1 + (a+b) * c)))
 Formula: y ~ 1 + a + b + c + c & a + c & b
 ```
 
+You may be wondering why formulas in Julia require a macro, while in R they appear "bare."
+R supports nonstandard evaluation, allowing the formula to remain an unevaluated object
+while its terms are parsed out. Julia uses a much more standard evaluation mechanism,
+making this impossible using normal expressions. However, unlike R, Julia provides macros to
+explicitly indicate when code itself will be manipulated before it's evaluated. By constructing
+a formula using a macro, we're able to provide convenient, R-like syntax and semantics.
+
 ## The `ModelFrame` and `ModelMatrix` types
 
 The main use of `Formula`s is for fitting statistical models based on tabular
diff --git a/src/StatsModels.jl b/src/StatsModels.jl
@@ -9,7 +9,7 @@ using NullableArrays
 using CategoricalArrays
 
 
-export @~,
+export @formula,
        Formula,
        ModelFrame,
        ModelMatrix,
diff --git a/src/formula.jl b/src/formula.jl
@@ -1,6 +1,6 @@
 # Formulas for representing and working with linear-model-type expressions
-# Original by Harlan D. Harris.  Later modifications by John Myles White
-# and Douglas M. Bates.
+# Original by Harlan D. Harris.  Later modifications by John Myles White,
+# Douglas M. Bates, and other contributors.
 
 ## Formulas are written as expressions and parsed by the Julia parser.
 ## For example :(y ~ a + b + log(c))
@@ -12,16 +12,19 @@
 ## The rhs of a formula can be 1
 
 type Formula
-    lhs::@compat(Union{Symbol, Expr, Void})
-    rhs::@compat(Union{Symbol, Expr, Integer})
+    lhs::Union{Symbol, Expr, Void}
+    rhs::Union{Symbol, Expr, Integer}
 end
 
-macro ~(lhs, rhs)
-    ex = Expr(:call,
-              :Formula,
-              Base.Meta.quot(lhs),
-              Base.Meta.quot(rhs))
-    return ex
+macro formula(ex)
+    if (ex.head === :macrocall && ex.args[1] === Symbol("@~")) || (ex.head === :call && ex.args[1] === :(~))
+        length(ex.args) == 3 || error("malformed expression in formula")
+        lhs = Base.Meta.quot(ex.args[2])
+        rhs = Base.Meta.quot(ex.args[3])
+    else
+        error("expected formula separator ~, got $(ex.head)")
+    end
+    return Expr(:call, :Formula, lhs, rhs)
 end
 
 """
@@ -46,9 +49,7 @@ end
 Base.:(==)(t1::Terms, t2::Terms) = all(getfield(t1, f)==getfield(t2, f) for f in fieldnames(t1))
 
 function Base.show(io::IO, f::Formula)
-    print(io,
-          string("Formula: ",
-                 f.lhs == nothing ? "" : f.lhs, " ~ ", f.rhs))
+    print(io, "Formula: ", f.lhs === nothing ? "" : f.lhs, " ~ ", f.rhs)
 end
 
 # special operators in formulas
diff --git a/test/formula.jl b/test/formula.jl
@@ -12,8 +12,7 @@ using Compat
 # - support more transformations with I()?
 
 ## Formula parsing
-import StatsModels: @~, Formula
-import StatsModels.Terms
+import StatsModels: @formula, Formula, Terms
 
 ## totally empty
 t = Terms(Formula(nothing, 0))
@@ -23,87 +22,90 @@ t = Terms(Formula(nothing, 0))
 @test t.eterms == []
 
 ## empty RHS
-t = Terms(y ~ 0)
+t = Terms(@formula(y ~ 0))
 @test t.intercept == false
 @test t.terms == []
 @test t.eterms == [:y]
-t = Terms(y ~ -1)
+t = Terms(@formula(y ~ -1))
 @test t.intercept == false
 @test t.terms == []
 
 ## intercept-only
-t = Terms(y ~ 1)
+t = Terms(@formula(y ~ 1))
 @test t.response == true
 @test t.intercept == true
 @test t.terms == []
 @test t.eterms == [:y]
 
 ## terms add
-t = Terms(y ~ 1 + x1 + x2)
+t = Terms(@formula(y ~ 1 + x1 + x2))
 @test t.intercept == true
 @test t.terms == [:x1, :x2]
 @test t.eterms == [:y, :x1, :x2]
 
 ## implicit intercept behavior:
-t = Terms(y ~ x1 + x2)
+t = Terms(@formula(y ~ x1 + x2))
 @test t.intercept == true
 @test t.terms == [:x1, :x2]
 @test t.eterms == [:y, :x1, :x2]
 
 ## no intercept
-t = Terms(y ~ 0 + x1 + x2)
+t = Terms(@formula(y ~ 0 + x1 + x2))
 @test t.intercept == false
 @test t.terms == [:x1, :x2]
 
-@test t == Terms(y ~ -1 + x1 + x2) == Terms(y ~ x1 - 1 + x2) == Terms(y ~ x1 + x2 -1)
+@test t == Terms(@formula(y ~ -1 + x1 + x2)) == Terms(@formula(y ~ x1 - 1 + x2)) == Terms(@formula(y ~ x1 + x2 -1))
 
 ## can't subtract terms other than 1
-@test_throws ErrorException Terms(y ~ x1 - x2)
+@test_throws ErrorException Terms(@formula(y ~ x1 - x2))
 
-t = Terms(y ~ x1 & x2)
+t = Terms(@formula(y ~ x1 & x2))
 @test t.terms == [:(x1 & x2)]
 @test t.eterms == [:y, :x1, :x2]
 
 ## `*` expansion
-t = Terms(y ~ x1 * x2)
+t = Terms(@formula(y ~ x1 * x2))
 @test t.terms == [:x1, :x2, :(x1 & x2)]
 @test t.eterms == [:y, :x1, :x2]
 
 ## associative rule:
 ## +
-t = Terms(y ~ x1 + x2 + x3)
+t = Terms(@formula(y ~ x1 + x2 + x3))
 @test t.terms == [:x1, :x2, :x3]
 
 ## &
-t = Terms(y ~ x1 & x2 & x3)
+t = Terms(@formula(y ~ x1 & x2 & x3))
 @test t.terms == [:((&)(x1, x2, x3))]
 @test t.eterms == [:y, :x1, :x2, :x3]
 
 ## distributive property of + and &
-t = Terms(y ~ x1 & (x2 + x3))
+t = Terms(@formula(y ~ x1 & (x2 + x3)))
 @test t.terms == [:(x1&x2), :(x1&x3)]
 
 ## FAILS: ordering of expanded interaction terms is wrong
 ## (only has an observable effect when both terms are categorical and
 ## produce multiple model matrix columns that are multiplied together...)
 ##
-## t = Terms(y ~ (x2 + x3) & x1)
+## t = Terms(@formula(y ~ (x2 + x3) & x1))
 ## @test t.terms == [:(x2&x1), :(x3&x1)]
 
 ## three-way *
-t = Terms(y ~ x1 * x2 * x3)
+t = Terms(@formula(y ~ x1 * x2 * x3))
 @test t.terms == [:x1, :x2, :x3,
                   :(x1&x2), :(x1&x3), :(x2&x3),
                   :((&)(x1, x2, x3))]
 @test t.eterms == [:y, :x1, :x2, :x3]
 
 ## Interactions with `1` reduce to main effect.  All fail at the moment.
-## t = Terms(y ~ 1 & x1)
+## t = Terms(@formula(y ~ 1 & x1))
 ## @test t.terms == [:x1]              # == [:(1 & x1)]
 ## @test t.eterms == [:y, :x1]
 
-## t = Terms(y ~ (1 + x1) & x2)
+## t = Terms(@formula(y ~ (1 + x1) & x2))
 ## @test t.terms == [:x2, :(x1&x2)]    # == [:(1 & x1)]
 ## @test t.eterms == [:y, :x1, :x2]
 
+# Incorrect formula separator
+@test_throws ErrorException eval(:(@formula(y => x + 1)))
+
 end
diff --git a/test/modelmatrix.jl b/test/modelmatrix.jl
diff --git a/test/statsmodel.jl b/test/statsmodel.jl