Skip to content

xarray support for categorical data #91

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ matrix:
env: PANDAS_VERSION_STR="=0.14.0 libgfortran=1.0"
# 0.18.0 has is_categorical_dtype in a different place than 0.19.0+
- python: 3.4
env: PANDAS_VERSION_STR="=0.18.0"
env: PANDAS_VERSION_STR="=0.18.0" XARRAY_VERSION_STR=">=0.7.0"
- python: 2.7
env: PANDAS_VERSION_STR="=0.18.0"
env: PANDAS_VERSION_STR="=0.18.0" XARRAY_VERSION_STR=">=0.7.0"
# make sure it works without pandas
- python: 3.5
env: PANDAS_VERSION_STR="NONE"
Expand All @@ -42,6 +42,7 @@ before_install:
- conda create -q -n testenv python=$TRAVIS_PYTHON_VERSION numpy scipy coverage nose pip
- source activate testenv
- if [ "$PANDAS_VERSION_STR" != "NONE" ]; then conda install pandas${PANDAS_VERSION_STR}; fi
- if [ -n "$XARRAY_VERSION_STR" ]; then conda install xarray${XARRAY_VERSION_STR}; fi
install:
- python setup.py sdist
- pip install dist/*
Expand Down
2 changes: 2 additions & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,8 @@ def setup(app):
None),
"pandas": ('http://pandas.pydata.org/pandas-docs/stable/',
None),
"xarray": ('http://http://xarray.pydata.org/en/stable/',
None),
}

autodoc_member_order = "source"
5 changes: 3 additions & 2 deletions doc/quickstart.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ variables:
Of course Patsy doesn't much care what sort of object you store
your data in, so long as it can be indexed like a Python dictionary,
``data[varname]``. You may prefer to store your data in a `pandas
<http://pandas.pydata.org>`_ DataFrame, or a numpy
record array... whatever makes you happy.
<http://pandas.pydata.org>`_ DataFrame, an
`xarray <http://xarray.pydata.org>`_ Dataset (using variables that have
1 or 2 dimensions), or a numpy record array... whatever makes you happy.

Now, let's generate design matrices suitable for regressing ``y`` onto
``x1`` and ``x2``.
Expand Down
10 changes: 10 additions & 0 deletions patsy/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,14 @@
pandas_Categorical_from_codes,
pandas_Categorical_categories,
pandas_Categorical_codes,
have_xarray,
safe_issubdtype,
no_pickling, assert_no_pickling)

if have_pandas:
import pandas
if have_xarray:
import xarray

# Objects of this type will always be treated as categorical, with the
# specified levels and contrast (if given).
Expand Down Expand Up @@ -188,6 +191,9 @@ def sniff(self, data):
else:
# unbox and fall through
data = data.data
if have_xarray:
if isinstance(data, xarray.DataArray):
data = data.values
if safe_is_pandas_categorical(data):
# pandas.Categorical has its own NA detection, so don't try to
# second-guess it.
Expand Down Expand Up @@ -324,6 +330,10 @@ def categorical_to_int(data, levels, NA_action, origin=None):
% (levels, tuple(data.levels)), origin)
data = data.data

if have_xarray:
if isinstance(data, xarray.DataArray):
data = data.values

data = _categorical_shape_fix(data)

try:
Expand Down
5 changes: 4 additions & 1 deletion patsy/highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,10 @@ def dmatrix(formula_like, data={}, eval_env=0,
:arg formula_like: An object that can be used to construct a design
matrix. See below.
:arg data: A dict-like object that can be used to look up variables
referenced in `formula_like`.
referenced in `formula_like`, including 1 or 2 dimensional
numpy record arrays, :class:`pandas.DataFrame` or
:class:`pandas.Series`, and :class:`xarray.DataArray` or
:class:`xarray.Dataset` objects.
:arg eval_env: Either a :class:`EvalEnvironment` which will be used to
look up any variables referenced in `formula_like` that cannot be
found in `data`, or else a depth represented as an
Expand Down
54 changes: 53 additions & 1 deletion patsy/test_highlevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@
from patsy.util import (have_pandas,
have_pandas_categorical,
have_pandas_categorical_dtype,
pandas_Categorical_from_codes)
pandas_Categorical_from_codes,
have_xarray)
from patsy.origin import Origin

if have_pandas:
import pandas
if have_xarray:
import xarray

def check_result(expect_full_designs, lhs, rhs, data,
expected_rhs_values, expected_rhs_names,
Expand Down Expand Up @@ -758,3 +761,52 @@ def test_C_and_pandas_categorical():
[[1, 0],
[1, 1],
[1, 0]])

def test_xarray_categorical_design():
    """Build design matrices from an ``xarray.Dataset`` and check them.

    Covers string-valued and integer-valued variables used as categorical
    terms, both wrapped in ``C()`` and bare, alongside a numeric term.
    Returns immediately without checking anything when ``have_xarray`` is
    false.
    """
    if not have_xarray:
        return

    def build_dataset(cat_labels):
        # Three variables that all share the single 'time' dimension.
        variables = {
            'num': (['time'], [0.0, 1.0, 2.0]),
            'integer': (['time'], [0, 1, 2]),
            'cat': (['time'], cat_labels),
        }
        return xarray.Dataset(variables,
                              coords={'time': [2000, 2001, 2002]})

    def check(formula, dataset, expected_names, expected_rows):
        # Compare both the column labels and the numeric contents.
        design = dmatrix(formula, dataset)
        assert design.design_info.column_names == expected_names
        assert np.allclose(design, np.array(expected_rows))

    dataset = build_dataset(['foo', 'bar', 'foo'])

    # numeric + C(string categorical)
    check('1 + num + C(cat)', dataset,
          ["Intercept", "C(cat)[T.foo]", "num"],
          [[1, 1, 0],
           [1, 0, 1],
           [1, 1, 2]])

    # integer + bare string categorical
    check('1 + integer + cat', dataset,
          ["Intercept", "cat[T.foo]", "integer"],
          [[1, 1, 0],
           [1, 0, 1],
           [1, 1, 2]])

    # numeric + C(integer)
    check('1 + num + C(integer)', dataset,
          ["Intercept", "C(integer)[T.1]", "C(integer)[T.2]", "num"],
          [[1, 0, 0, 0],
           [1, 1, 0, 1],
           [1, 0, 1, 2]])

    # Strings that look like numbers: the expectations below still use
    # categorical column naming ("cat[T.1]").
    dataset = build_dataset(['1', '0', '1'])
    check('1 + num + cat', dataset,
          ["Intercept", "cat[T.1]", "num"],
          [[1, 1, 0],
           [1, 0, 1],
           [1, 1, 2]])
9 changes: 8 additions & 1 deletion patsy/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"repr_pretty_delegate", "repr_pretty_impl",
"SortAnythingKey", "safe_scalar_isnan", "safe_isnan",
"iterable",
"have_pandas",
"have_pandas", "have_xarray",
"have_pandas_categorical",
"have_pandas_categorical_dtype",
"pandas_Categorical_from_codes",
Expand All @@ -36,6 +36,13 @@
else:
have_pandas = True

try:
import xarray
except ImportError:
have_xarray = False
else:
have_xarray = True

# Pandas versions < 0.9.0 don't have Categorical
# Can drop this guard whenever we drop support for such older versions of
# pandas.
Expand Down
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ changedir={envdir}
commands=
pip install scipy
sh -c "[ \"x$NO_PANDAS\" = xTRUE ] || pip install pandas"
sh -c "[ \"x$NO_XARRAY\" = xTRUE ] || pip install xarray"
coverage run --rcfile={toxinidir}/.coveragerc -p {envbindir}/nosetests --all-modules patsy {posargs:}
env PATSY_AVOID_OPTIONAL_DEPENDENCIES=1 coverage run --rcfile={toxinidir}/.coveragerc -p {envbindir}/nosetests --all-modules patsy {posargs:}
coverage combine --rcfile={toxinidir}/.coveragerc
Expand Down