pydata · ceholden · Sep 6, 2016 · Nov 4, 2016 · Nov 5, 2016 · Nov 7, 2016
diff --git a/.travis.yml b/.travis.yml
@@ -14,9 +14,9 @@ matrix:
       env: PANDAS_VERSION_STR="=0.14.0 libgfortran=1.0"
     # 0.18.0 has is_categorical_dtype in a different place than 0.19.0+
     - python: 3.4
-      env: PANDAS_VERSION_STR="=0.18.0"
+      env: PANDAS_VERSION_STR="=0.18.0" XARRAY_VERSION_STR=">=0.7.0"
     - python: 2.7
-      env: PANDAS_VERSION_STR="=0.18.0"
+      env: PANDAS_VERSION_STR="=0.18.0" XARRAY_VERSION_STR=">=0.7.0"
     # make sure it works without pandas
     - python: 3.5
       env: PANDAS_VERSION_STR="NONE"
@@ -42,6 +42,7 @@ before_install:
   - conda create -q -n testenv python=$TRAVIS_PYTHON_VERSION numpy scipy coverage nose pip
   - source activate testenv
   - if [ "$PANDAS_VERSION_STR" != "NONE" ]; then conda install pandas${PANDAS_VERSION_STR}; fi
+  - if [ -n "$XARRAY_VERSION_STR" ]; then conda install xarray${XARRAY_VERSION_STR}; fi
 install:
   - python setup.py sdist
   - pip install dist/*

diff --git a/doc/conf.py b/doc/conf.py
@@ -236,6 +236,8 @@ def setup(app):
                                  None),
                        "pandas": ('http://pandas.pydata.org/pandas-docs/stable/',
                                   None),
+                       "xarray": ('http://xarray.pydata.org/en/stable/',
+                                  None),
                        }
 
 autodoc_member_order = "source"
diff --git a/doc/quickstart.rst b/doc/quickstart.rst
@@ -24,8 +24,9 @@ variables:
 Of course Patsy doesn't much care what sort of object you store
 your data in, so long as it can be indexed like a Python dictionary,
 ``data[varname]``. You may prefer to store your data in a `pandas
-<http://pandas.pydata.org>`_ DataFrame, or a numpy
-record array... whatever makes you happy.
+<http://pandas.pydata.org>`_ DataFrame, an
+`xarray <http://xarray.pydata.org>`_ Dataset (using variables that have
+1 or 2 dimensions), or a numpy record array... whatever makes you happy.
 
 Now, let's generate design matrices suitable for regressing ``y`` onto
 ``x1`` and ``x2``.

diff --git a/patsy/categorical.py b/patsy/categorical.py
@@ -45,11 +45,14 @@
                         pandas_Categorical_from_codes,
                         pandas_Categorical_categories,
                         pandas_Categorical_codes,
+                        have_xarray,
                         safe_issubdtype,
                         no_pickling, assert_no_pickling)
 
 if have_pandas:
     import pandas
+if have_xarray:
+    import xarray
 
 # Objects of this type will always be treated as categorical, with the
 # specified levels and contrast (if given).
@@ -188,6 +191,9 @@ def sniff(self, data):
             else:
                 # unbox and fall through
                 data = data.data
+        if have_xarray:
+            if isinstance(data, xarray.DataArray):
+                data = data.values
         if safe_is_pandas_categorical(data):
             # pandas.Categorical has its own NA detection, so don't try to
             # second-guess it.
@@ -324,6 +330,10 @@ def categorical_to_int(data, levels, NA_action, origin=None):
                              % (levels, tuple(data.levels)), origin)
         data = data.data
 
+    if have_xarray:
+        if isinstance(data, xarray.DataArray):
+            data = data.values
+
     data = _categorical_shape_fix(data)
 
     try:

diff --git a/patsy/highlevel.py b/patsy/highlevel.py
@@ -229,7 +229,10 @@ def dmatrix(formula_like, data={}, eval_env=0,
     :arg formula_like: An object that can be used to construct a design
       matrix. See below.
     :arg data: A dict-like object that can be used to look up variables
-      referenced in `formula_like`.
+      referenced in `formula_like`, including 1- or 2-dimensional
+      numpy record arrays, :class:`pandas.DataFrame` or
+      :class:`pandas.Series`, and :class:`xarray.DataArray` or
+      :class:`xarray.Dataset` objects.
     :arg eval_env: Either a :class:`EvalEnvironment` which will be used to
       look up any variables referenced in `formula_like` that cannot be
       found in `data`, or else a depth represented as an

diff --git a/patsy/test_highlevel.py b/patsy/test_highlevel.py
@@ -22,11 +22,14 @@
 from patsy.util import (have_pandas,
                         have_pandas_categorical,
                         have_pandas_categorical_dtype,
-                        pandas_Categorical_from_codes)
+                        pandas_Categorical_from_codes,
+                        have_xarray)
 from patsy.origin import Origin
 
 if have_pandas:
     import pandas
+if have_xarray:
+    import xarray
 
 def check_result(expect_full_designs, lhs, rhs, data,
                  expected_rhs_values, expected_rhs_names,
@@ -758,3 +761,52 @@ def test_C_and_pandas_categorical():
                            [[1, 0],
                             [1, 1],
                             [1, 0]])
+
+def test_xarray_categorical_design():
+    if not have_xarray:
+        return
+    data = {
+        'num': (['time'], [0.0, 1.0, 2.0]),
+        'integer': (['time'], [0, 1, 2]),
+        'cat': (['time'], ['foo', 'bar', 'foo'])
+    }
+    ds = xarray.Dataset(data, coords={'time': [2000, 2001, 2002]})
+
+    # Exercise float, integer and string variables, with and without "C()".
+    # float + C(string): only a T.foo column, so "bar" is the reference level
+    dmat = dmatrix('1 + num + C(cat)', ds)
+    assert dmat.design_info.column_names == ["Intercept",
+                                             "C(cat)[T.foo]",
+                                             "num"]
+    assert np.allclose(dmat, np.vstack(([1, 1, 0],
+                                        [1, 0, 1],
+                                        [1, 1, 2])))
+
+    # integer + bare string: the string is coded as categorical without C()
+    dmat = dmatrix('1 + integer + cat', ds)
+    assert dmat.design_info.column_names == ["Intercept",
+                                             "cat[T.foo]",
+                                             "integer"]
+    assert np.allclose(dmat, np.vstack(([1, 1, 0],
+                                        [1, 0, 1],
+                                        [1, 1, 2])))
+
+    # float + C(integer): C() codes the integer as levels 0 (reference), 1, 2
+    dmat = dmatrix('1 + num + C(integer)', ds)
+    assert dmat.design_info.column_names == ["Intercept",
+                                             "C(integer)[T.1]",
+                                             "C(integer)[T.2]",
+                                             "num"]
+    assert np.allclose(dmat, np.vstack(([1, 0, 0, 0],
+                                        [1, 1, 0, 1],
+                                        [1, 0, 1, 2])))
+    # Strings that look like digits are still expected to be coded as categorical
+    data['cat'] = (['time'], ['1', '0', '1'])
+    ds = xarray.Dataset(data, coords={'time': [2000, 2001, 2002]})
+    dmat = dmatrix('1 + num + cat', ds)
+    assert dmat.design_info.column_names == ["Intercept",
+                                             "cat[T.1]",
+                                             "num"]
+    assert np.allclose(dmat, np.column_stack(([1, 1, 1],
+                                              [1, 0, 1],
+                                              [0, 1, 2])))
diff --git a/patsy/util.py b/patsy/util.py
@@ -9,7 +9,7 @@
            "repr_pretty_delegate", "repr_pretty_impl",
            "SortAnythingKey", "safe_scalar_isnan", "safe_isnan",
            "iterable",
-           "have_pandas",
+           "have_pandas", "have_xarray",
            "have_pandas_categorical",
            "have_pandas_categorical_dtype",
            "pandas_Categorical_from_codes",
@@ -36,6 +36,13 @@
 else:
     have_pandas = True
 
+try:
+    import xarray
+except ImportError:
+    have_xarray = False
+else:
+    have_xarray = True
+
 # Pandas versions < 0.9.0 don't have Categorical
 # Can drop this guard whenever we drop support for such older versions of
 # pandas.

diff --git a/tox.ini b/tox.ini
@@ -16,6 +16,7 @@ changedir={envdir}
 commands=
   pip install scipy
   sh -c "[ \"x$NO_PANDAS\" = xTRUE ] || pip install pandas"
+  sh -c "[ \"x$NO_XARRAY\" = xTRUE ] || pip install xarray"
   coverage run --rcfile={toxinidir}/.coveragerc -p {envbindir}/nosetests --all-modules patsy {posargs:}
   env PATSY_AVOID_OPTIONAL_DEPENDENCIES=1 coverage run --rcfile={toxinidir}/.coveragerc -p {envbindir}/nosetests --all-modules patsy {posargs:}
   coverage combine --rcfile={toxinidir}/.coveragerc