forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Arrow] Unifying Arrow conversion handling, fixing handling of >2Gb objects (ray-project#48487)

## Why are these changes needed? Addresses ray-project#48419. Currently, we defer to PyArrow to infer the corresponding data type to represent column values returned by the Map-based operators. However, Arrow does not infer the `large_*` kinds of types even in somewhat trivial cases of strings, byte-strings, etc., resulting in `ArrowCapacityError` when you try to add a single string/byte-string >2Gb. This change addresses that by: unifying handling of conversion to Numpy/Arrow in a single place (unifying it across different code-paths); fixing incorrect fallbacks to `ArrowPythonObjectType`; upscaling `binary` and `string` to their Large counterparts (i.e. `large_binary`, `large_string`, etc.) if objects we're adding to the Arrow array are >2Gb. Signed-off-by: Alexey Kudinkin <ak@anyscale.com>
- Loading branch information
1 parent
73c956e
commit b5934e5
Showing
26 changed files
with
602 additions
and
204 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
from dataclasses import dataclass, field | ||
|
||
import pyarrow as pa | ||
import pytest | ||
|
||
from ray.air.util.tensor_extensions.arrow import ( | ||
ArrowConversionError, | ||
_convert_to_pyarrow_native_array, | ||
_infer_pyarrow_type, | ||
convert_to_pyarrow_array, | ||
) | ||
from ray.air.util.tensor_extensions.utils import create_ragged_ndarray | ||
|
||
|
||
@dataclass
class UserObj:
    """Arbitrary user-defined object that Arrow cannot natively serialize.

    Used as a fixture to force the object-extension-type fallback path.
    """

    # Required field (no default); dataclass-generated __init__/__eq__ cover it.
    i: int
|
||
|
||
def test_pa_infer_type_failing_to_infer():
    """Check Arrow's type inference on a heterogeneous object column.

    Represents a single column that will be using `ArrowPythonObjectExtension`
    type to ser/de native Python objects into bytes.
    """
    heterogeneous_values = create_ragged_ndarray(
        [
            "hi",
            1,
            None,
            [[[[]]]],
            {"a": [[{"b": 2, "c": UserObj(i=123)}]]},
            UserObj(i=456),
        ]
    )

    dtype = _infer_pyarrow_type(heterogeneous_values)

    # Arrow (17.0) seems to fall back to assuming the dtype of the first
    # element ("hi" -> string) rather than failing outright.
    assert pa.string().equals(dtype)
|
||
|
||
def test_convert_to_pyarrow_array_object_ext_type_fallback():
    """Verify the two-stage conversion contract for arbitrary Python objects:

    direct Arrow-native conversion raises `ArrowConversionError`, while the
    public `convert_to_pyarrow_array` falls back to the object extension type
    and round-trips the values unchanged.
    """
    values = create_ragged_ndarray(
        [
            "hi",
            1,
            None,
            [[[[]]]],
            {"a": [[{"b": 2, "c": UserObj(i=123)}]]},
            UserObj(i=456),
        ]
    )
    name = "py_object_column"

    # First, assert that straightforward conversion into Arrow native types fails
    with pytest.raises(ArrowConversionError) as exc_info:
        _convert_to_pyarrow_native_array(values, name)

    expected_message = "Error converting data to Arrow: ['hi' 1 None list([[[[]]]]) {'a': [[{'b': 2, 'c': UserObj(i=123)}]]}\n UserObj(i=456)]"  # noqa: E501
    assert str(exc_info.value) == expected_message

    # Subsequently, assert that fallback to `ArrowObjectExtensionType` succeeds
    arrow_array = convert_to_pyarrow_array(values, name)

    assert arrow_array.to_pylist() == values.tolist()
|
||
|
||
if __name__ == "__main__":
    import sys

    # Allow running this test module directly; propagate pytest's exit
    # status (-x stops at the first failure, -v prints each test name).
    sys.exit(pytest.main(["-v", "-x", __file__]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.