Split convert_observed_data and apply intX to int generators
Previously, the `GeneratorAdapter` applied `floatX` to float data,
but kept the original integer dtypes.
`floatX` was then applied to everything by `convert_observed_data`.

This refactor changes the handling of integer-valued generator data
so that `intX` is applied and no `floatX` conversion takes place.
michaelosthege committed May 23, 2024
1 parent 5c1c647 commit 6b8e8f3
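For context on the dtype helpers involved: `floatX` and `intX` cast values to PyTensor's configured float/int precision. The `smarttypeX` helper used in this commit dispatches on the input's dtype; a rough, illustrative sketch of that behavior (not the actual implementation):

    import numpy as np

    from pymc.pytensorf import floatX, intX

    def smarttypeX_sketch(x):
        # Cast integer data to intX and float data to floatX;
        # leave anything else untouched.
        if "int" in str(x.dtype):
            return intX(x)
        if "float" in str(x.dtype):
            return floatX(x)
        return x

    print(smarttypeX_sketch(np.arange(3, dtype="int16")).dtype)  # intX, e.g. int64
    print(smarttypeX_sketch(np.ones(3, dtype="float16")).dtype)  # floatX, e.g. float64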
Showing 4 changed files with 60 additions and 34 deletions.
docs/source/api/pytensorf.rst (3 changes: 2 additions & 1 deletion)
@@ -22,4 +22,5 @@ PyTensor utils
     join_nonshared_inputs
     make_shared_replacements
     generator
-    convert_observed_data
+    convert_generator_data
+    convert_data
pymc/data.py (19 changes: 13 additions & 6 deletions)
@@ -37,7 +37,8 @@

 import pymc as pm
 
-from pymc.pytensorf import convert_observed_data
+from pymc.pytensorf import convert_data
+from pymc.vartypes import isgenerator
 
 __all__ = [
     "get_data",
@@ -98,7 +99,7 @@ def make_variable(self, gop, name=None):
     def __init__(self, generator):
         if not pm.vartypes.isgenerator(generator):
             raise TypeError("Object should be generator like")
-        self.test_value = pm.smartfloatX(copy(next(generator)))
+        self.test_value = pm.smarttypeX(copy(next(generator)))
         # make pickling potentially possible
         self._yielded_test_value = False
         self.gen = generator
@@ -110,7 +111,7 @@ def __next__(self):
             self._yielded_test_value = True
             return self.test_value
         else:
-            return pm.smartfloatX(copy(next(self.gen)))
+            return pm.smarttypeX(copy(next(self.gen)))
 
     # python2 generator
     next = __next__
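The practical effect: integer arrays coming out of a `GeneratorAdapter` are now normalized to `intX` instead of keeping their original width. A hypothetical session (`GeneratorAdapter` is an internal class in `pymc.data`; the exact result dtype depends on the `floatX`/`intX` configuration):

    import numpy as np

    from pymc.data import GeneratorAdapter

    adapter = GeneratorAdapter(np.array([i], dtype="int16") for i in range(5))
    # Previously this stayed int16; with smarttypeX it becomes intX (e.g. int64).
    print(next(adapter).dtype)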
@@ -403,9 +404,15 @@ def Data(
     )
     name = model.name_for(name)
 
-    # `convert_observed_data` takes care of parameter `value` and
-    # transforms it to something digestible for PyTensor.
-    arr = convert_observed_data(value)
+    # Transform `value` into something digestible for PyTensor.
+    if isgenerator(value):
+        raise NotImplementedError(
+            "Generator type data is no longer supported with pm.Data.",
+            # It messes up InferenceData and can't be the input to a SharedVariable.
+        )
+    else:
+        arr = convert_data(value)
 
     if isinstance(arr, np.ma.MaskedArray):
         raise NotImplementedError(
             "Masked arrays or arrays with `nan` entries are not supported. "
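With this change, passing a generator to `pm.Data` fails fast. A hypothetical session illustrating the diff above:

    import numpy as np

    import pymc as pm

    gen = (np.array([i], dtype="int64") for i in range(10))
    with pm.Model():
        # Previously the generator was converted via convert_observed_data;
        # now this raises NotImplementedError.
        pm.Data("x", gen)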
pymc/pytensorf.py (26 changes: 16 additions & 10 deletions)
@@ -52,6 +52,7 @@
 from pytensor.tensor.subtensor import AdvancedIncSubtensor, AdvancedIncSubtensor1
 from pytensor.tensor.variable import TensorVariable
 
+from pymc.data import GenTensorVariable
 from pymc.exceptions import NotConstantValueError
 from pymc.util import makeiter
 from pymc.vartypes import continuous_types, isgenerator, typefilter
@@ -74,17 +75,26 @@
"join_nonshared_inputs",
"make_shared_replacements",
"generator",
"convert_data",
"convert_generator_data",
"convert_observed_data",
"compile_pymc",
]


def convert_observed_data(data) -> np.ndarray | Variable:
"""Convert user provided dataset to accepted formats."""

if isgenerator(data):
return floatX(generator(data))
return convert_generator_data(data)
return convert_data(data)


def convert_generator_data(data) -> GenTensorVariable:
return generator(data)


def convert_data(data) -> np.ndarray | Variable:
ret: np.ndarray | Variable
if hasattr(data, "to_numpy") and hasattr(data, "isnull"):
# typically, but not limited to pandas objects
vals = data.to_numpy()
@@ -125,14 +135,10 @@ def convert_observed_data(data) -> np.ndarray | Variable:

     # type handling to enable index variables when data is int:
     if hasattr(data, "dtype"):
-        if "int" in str(data.dtype):
-            return intX(ret)
-        # otherwise, assume float:
-        else:
-            return floatX(ret)
-    # needed for uses of this function other than with pm.Data:
-    else:
-        return floatX(ret)
+        return smarttypeX(data)
+    # needed for uses of this function other than with pm.Data,
+    # which are primarily related to inferring shapes.
+    return floatX(ret)
 
 
 @_as_tensor_variable.register(pd.Series)
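In short, `convert_observed_data` is now a thin dispatcher over the two new helpers. A sketch of the resulting behavior (exact dtypes depend on the `floatX`/`intX` configuration):

    import numpy as np

    from pymc.pytensorf import convert_observed_data

    # Array-likes take the convert_data path; integer kinds stay integer, as intX.
    print(convert_observed_data(np.arange(5, dtype="int32")).dtype)  # e.g. int64

    # Generators take the convert_generator_data path and come back as a
    # symbolic GeneratorOp variable; no blanket floatX cast is applied anymore.
    var = convert_observed_data(np.ones(2, dtype="float32") for _ in range(100))
    print(var.type.dtype)  # e.g. float64 under floatX="float64"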
tests/test_pytensorf.py (46 changes: 29 additions & 17 deletions)
@@ -38,10 +38,12 @@
 from pymc.exceptions import NotConstantValueError
 from pymc.logprob.utils import ParameterValueError
 from pymc.pytensorf import (
+    GeneratorOp,
     collect_default_updates,
     compile_pymc,
     constant_fold,
-    convert_observed_data,
+    convert_data,
+    convert_generator_data,
     extract_obs_data,
     hessian,
     hessian_diag,
@@ -188,9 +190,9 @@ def test_extract_obs_data():


 @pytest.mark.parametrize("input_dtype", ["int32", "int64", "float32", "float64"])
-def test_convert_observed_data(input_dtype):
+def test_convert_data(input_dtype):
     """
-    Ensure that convert_observed_data returns the dense array, masked array,
+    Ensure that convert_data returns the dense array, masked array,
     graph variable, TensorVariable, or sparse matrix as appropriate.
     """
     # Create the various inputs to the function
@@ -206,12 +208,8 @@ def test_convert_observed_data(input_dtype):
     missing_pandas_input = pd.DataFrame(missing_numpy_input)
     masked_array_input = ma.array(dense_input, mask=(np.mod(dense_input, 2) == 0))
 
-    # Create a generator object. Apparently the generator object needs to
-    # yield numpy arrays.
-    square_generator = (np.array([i**2], dtype=int) for i in range(100))
-
     # Alias the function to be tested
-    func = convert_observed_data
+    func = convert_data
 
     #####
     # Perform the various tests
@@ -255,21 +253,35 @@
     else:
         assert pytensor_output.dtype == intX
 
-    # Check function behavior with generator data
-    generator_output = func(square_generator)
-
-    # Output is wrapped with `pm.floatX`, and this unwraps
-    wrapped = generator_output.owner.inputs[0]
-    # Make sure the returned object has .set_gen and .set_default methods
-    assert hasattr(wrapped, "set_gen")
-    assert hasattr(wrapped, "set_default")
+
+@pytest.mark.parametrize("input_dtype", ["int32", "int64", "float32", "float64"])
+def test_convert_generator_data(input_dtype):
+    # Create a generator object producing NumPy arrays with the intended dtype.
+    # This is required to infer the correct dtype.
+    square_generator = (np.array([i**2], dtype=input_dtype) for i in range(100))
+
+    # Output is NOT wrapped with `pm.floatX`/`intX`,
+    # but produced from calling a special Op.
+    result = convert_generator_data(square_generator)
+    apply = result.owner
+    op = apply.op
     # Make sure the returned object is a PyTensor TensorVariable
-    assert isinstance(wrapped, TensorVariable)
+    assert isinstance(result, TensorVariable)
+    assert isinstance(op, GeneratorOp), f"It's a {type(apply)}"
+    # There are no inputs - because it generates...
+    assert apply.inputs == []
+
+    # Evaluation results should have the correct* dtype!
+    # (*intX/floatX will be enforced!)
+    evaled = result.eval()
+    expected_dtype = pm.smarttypeX(np.array(1, dtype=input_dtype)).dtype
+    assert result.type.dtype == expected_dtype
+    assert evaled.dtype == np.dtype(expected_dtype)


 def test_pandas_to_array_pandas_index():
     data = pd.Index([1, 2, 3])
-    result = convert_observed_data(data)
+    result = convert_data(data)
     expected = np.array([1, 2, 3])
     np.testing.assert_array_equal(result, expected)
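For orientation, the `expected_dtype` asserted in `test_convert_generator_data` presumably works out as follows under PyTensor's default `floatX="float64"` (the int mapping is an assumption; check `pymc.pytensorf` to confirm):

    import numpy as np

    import pymc as pm

    for dtype in ["int32", "int64", "float32", "float64"]:
        # Mirrors the expected_dtype line from the test above.
        print(dtype, "->", pm.smarttypeX(np.array(1, dtype=dtype)).dtype)
    # Assumed output: int32 -> int64, int64 -> int64,
    #                 float32 -> float64, float64 -> float64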
