diff --git a/docs/source/api/pytensorf.rst b/docs/source/api/pytensorf.rst index b03ccf2adf..4d4555f198 100644 --- a/docs/source/api/pytensorf.rst +++ b/docs/source/api/pytensorf.rst @@ -22,4 +22,5 @@ PyTensor utils join_nonshared_inputs make_shared_replacements generator - convert_observed_data + convert_generator_data + convert_data diff --git a/pymc/data.py b/pymc/data.py index 3c72bf64b8..c339869816 100644 --- a/pymc/data.py +++ b/pymc/data.py @@ -37,7 +37,8 @@ import pymc as pm -from pymc.pytensorf import convert_observed_data +from pymc.pytensorf import convert_data +from pymc.vartypes import isgenerator __all__ = [ "get_data", @@ -98,7 +99,7 @@ def make_variable(self, gop, name=None): def __init__(self, generator): if not pm.vartypes.isgenerator(generator): raise TypeError("Object should be generator like") - self.test_value = pm.smartfloatX(copy(next(generator))) + self.test_value = pm.smarttypeX(copy(next(generator))) # make pickling potentially possible self._yielded_test_value = False self.gen = generator @@ -110,7 +111,7 @@ def __next__(self): self._yielded_test_value = True return self.test_value else: - return pm.smartfloatX(copy(next(self.gen))) + return pm.smarttypeX(copy(next(self.gen))) # python2 generator next = __next__ @@ -403,9 +404,15 @@ def Data( ) name = model.name_for(name) - # `convert_observed_data` takes care of parameter `value` and - # transforms it to something digestible for PyTensor. - arr = convert_observed_data(value) + # Transform `value` into something digestible for PyTensor. + if isgenerator(value): + raise NotImplementedError( + "Generator type data is no longer supported with pm.Data.", + # It messes up InferenceData and can't be the input to a SharedVariable. + ) + else: + arr = convert_data(value) + if isinstance(arr, np.ma.MaskedArray): raise NotImplementedError( "Masked arrays or arrays with `nan` entries are not supported. 
" diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py index edb1df5aaf..4c83fa4e39 100644 --- a/pymc/pytensorf.py +++ b/pymc/pytensorf.py @@ -52,6 +52,7 @@ from pytensor.tensor.subtensor import AdvancedIncSubtensor, AdvancedIncSubtensor1 from pytensor.tensor.variable import TensorVariable +from pymc.data import GenTensorVariable from pymc.exceptions import NotConstantValueError from pymc.util import makeiter from pymc.vartypes import continuous_types, isgenerator, typefilter @@ -74,6 +75,8 @@ "join_nonshared_inputs", "make_shared_replacements", "generator", + "convert_data", + "convert_generator_data", "convert_observed_data", "compile_pymc", ] @@ -81,10 +84,17 @@ def convert_observed_data(data) -> np.ndarray | Variable: """Convert user provided dataset to accepted formats.""" - if isgenerator(data): - return floatX(generator(data)) + return convert_generator_data(data) + return convert_data(data) + + +def convert_generator_data(data) -> GenTensorVariable: + return generator(data) + +def convert_data(data) -> np.ndarray | Variable: + ret: np.ndarray | Variable if hasattr(data, "to_numpy") and hasattr(data, "isnull"): # typically, but not limited to pandas objects vals = data.to_numpy() @@ -125,14 +135,10 @@ def convert_observed_data(data) -> np.ndarray | Variable: # type handling to enable index variables when data is int: if hasattr(data, "dtype"): - if "int" in str(data.dtype): - return intX(ret) - # otherwise, assume float: - else: - return floatX(ret) - # needed for uses of this function other than with pm.Data: - else: - return floatX(ret) + return smarttypeX(ret) + # needed for uses of this function other than with pm.Data, + # which are primarily related to inferring shapes. 
+ return floatX(ret) @_as_tensor_variable.register(pd.Series) diff --git a/tests/test_pytensorf.py b/tests/test_pytensorf.py index cce3d223ca..f03b92b14c 100644 --- a/tests/test_pytensorf.py +++ b/tests/test_pytensorf.py @@ -38,10 +38,12 @@ from pymc.exceptions import NotConstantValueError from pymc.logprob.utils import ParameterValueError from pymc.pytensorf import ( + GeneratorOp, collect_default_updates, compile_pymc, constant_fold, - convert_observed_data, + convert_data, + convert_generator_data, extract_obs_data, hessian, hessian_diag, @@ -188,9 +190,9 @@ def test_extract_obs_data(): @pytest.mark.parametrize("input_dtype", ["int32", "int64", "float32", "float64"]) -def test_convert_observed_data(input_dtype): +def test_convert_data(input_dtype): """ - Ensure that convert_observed_data returns the dense array, masked array, + Ensure that convert_data returns the dense array, masked array, graph variable, TensorVariable, or sparse matrix as appropriate. """ # Create the various inputs to the function @@ -206,12 +208,8 @@ def test_convert_observed_data(input_dtype): missing_pandas_input = pd.DataFrame(missing_numpy_input) masked_array_input = ma.array(dense_input, mask=(np.mod(dense_input, 2) == 0)) - # Create a generator object. Apparently the generator object needs to - # yield numpy arrays. 
- square_generator = (np.array([i**2], dtype=int) for i in range(100)) - # Alias the function to be tested - func = convert_observed_data + func = convert_data ##### # Perform the various tests @@ -255,21 +253,35 @@ def test_convert_observed_data(input_dtype): else: assert pytensor_output.dtype == intX - # Check function behavior with generator data - generator_output = func(square_generator) - # Output is wrapped with `pm.floatX`, and this unwraps - wrapped = generator_output.owner.inputs[0] - # Make sure the returned object has .set_gen and .set_default methods - assert hasattr(wrapped, "set_gen") - assert hasattr(wrapped, "set_default") +@pytest.mark.parametrize("input_dtype", ["int32", "int64", "float32", "float64"]) +def test_convert_generator_data(input_dtype): + # Create a generator object producing NumPy arrays with the intended dtype. + # This is required to infer the correct dtype. + square_generator = (np.array([i**2], dtype=input_dtype) for i in range(100)) + + # Output is NOT wrapped with `pm.floatX`/`intX`, + # but produced from calling a special Op. + result = convert_generator_data(square_generator) + apply = result.owner + op = apply.op # Make sure the returned object is an PyTensor TensorVariable - assert isinstance(wrapped, TensorVariable) + assert isinstance(result, TensorVariable) + assert isinstance(op, GeneratorOp), f"It's a {type(op)}" + # There are no inputs - because it generates... + assert apply.inputs == [] + + # Evaluation results should have the correct* dtype! + # (*intX/floatX will be enforced!) + evaled = result.eval() + expected_dtype = pm.smarttypeX(np.array(1, dtype=input_dtype)).dtype + assert result.type.dtype == expected_dtype + assert evaled.dtype == np.dtype(expected_dtype) def test_pandas_to_array_pandas_index(): data = pd.Index([1, 2, 3]) - result = convert_observed_data(data) + result = convert_data(data) expected = np.array([1, 2, 3]) np.testing.assert_array_equal(result, expected)