Skip to content
This repository was archived by the owner on Oct 21, 2024. It is now read-only.

Commit c00dfe8

Browse files
wesm authored and kszucs committed
ARROW-7723: [Python] Triage untested functional regression when converting tz-aware timestamp inside struct to pandas/NumPy format
This is definitely a hack so I left notes everywhere and explained the problem in detail on the ARROW-7723 JIRA. This reverts behavior in this particular case to 0.15.1 but there is an inconsistency in the data types returned by `struct<f0: timestamp[us]>` versus `struct<f0: timestamp[ns]>`. The former returns a dict with `datetime.datetime` values while the latter has `int64` values (because nanoseconds can't be safely coerced to `datetime.datetime` in general). It seems undesirable at the moment to have any downstream user depending on one behavior or the other, but leaving the behavior unchanged for now (given that many systems -- e.g. Spark SQL -- don't have nanoseconds) seems acceptable. Closes apache#6322 from wesm/ARROW-7723 and squashes the following commits: 05ab21b <Wes McKinney> pin pandas 0.25.3 2178977 <Wes McKinney> Add longer code comment explaining the hack 7142699 <Wes McKinney> Gross hack to avoid unwanted nanosecond promotion with tz-aware timestamp types 123d4b8 <Wes McKinney> Add failing unit test Authored-by: Wes McKinney <wesm+git@apache.org> Signed-off-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
1 parent 7ee6936 commit c00dfe8

File tree

4 files changed

+44
-4
lines changed

4 files changed

+44
-4
lines changed

ci/docker/conda-python-spark.dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@ FROM ${repo}:${arch}-conda-python-${python}
2222

2323
ARG jdk=8
2424
ARG maven=3.5
25+
26+
# The Spark tests currently break with pandas >= 1.0
2527
RUN conda install -q \
2628
patch \
27-
pandas \
29+
pandas=0.25.3 \
2830
openjdk=${jdk} \
2931
maven=${maven} && \
3032
conda clean --all

cpp/src/arrow/python/arrow_to_pandas.cc

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -619,13 +619,24 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da
619619
auto array_type = arr->type();
620620
std::vector<OwnedRef> fields_data(num_fields);
621621
OwnedRef dict_item;
622+
623+
// XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second
624+
// through microsecond resolution tz-aware timestamps were being promoted to
625+
// use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy
626+
// array in this function. PyArray_GETITEM returns datetime.datetime for
627+
// units second through microsecond but PyLong for nanosecond (because
628+
// datetime.datetime does not support nanoseconds). We inserted this hack to
629+
// preserve the <= 0.15.1 behavior until a better solution can be devised
630+
PandasOptions modified_options = options;
631+
modified_options.ignore_timezone = true;
632+
622633
for (int c = 0; c < data.num_chunks(); c++) {
623634
auto arr = checked_cast<const StructArray*>(data.chunk(c).get());
624635
// Convert the struct arrays first
625636
for (int32_t i = 0; i < num_fields; i++) {
626637
PyObject* numpy_array;
627-
RETURN_NOT_OK(ConvertArrayToPandas(options, arr->field(static_cast<int>(i)),
628-
nullptr, &numpy_array));
638+
RETURN_NOT_OK(ConvertArrayToPandas(
639+
modified_options, arr->field(static_cast<int>(i)), nullptr, &numpy_array));
629640
fields_data[i].reset(numpy_array);
630641
}
631642

@@ -1678,7 +1689,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
16781689
break;
16791690
case Type::TIMESTAMP: {
16801691
const auto& ts_type = checked_cast<const TimestampType&>(*data.type());
1681-
if (ts_type.timezone() != "") {
1692+
// XXX: Hack here for ARROW-7723
1693+
if (ts_type.timezone() != "" && !options.ignore_timezone) {
16821694
*output_type = PandasWriter::DATETIME_NANO_TZ;
16831695
} else if (options.coerce_temporal_nanoseconds) {
16841696
*output_type = PandasWriter::DATETIME_NANO;

cpp/src/arrow/python/arrow_to_pandas.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ struct PandasOptions {
5555
/// Coerce all date and timestamp to datetime64[ns]
5656
bool coerce_temporal_nanoseconds = false;
5757

58+
/// XXX(wesm): Hack for ARROW-7723 to opt out of DATETIME_NANO_TZ conversion
59+
/// path
60+
bool ignore_timezone = false;
61+
5862
/// \brief If true, do not create duplicate PyObject versions of equal
5963
/// objects. This only applies to immutable objects like strings or datetime
6064
/// objects

python/pyarrow/tests/test_pandas.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3211,6 +3211,28 @@ def test_cast_timestamp_unit():
32113211
assert result.equals(expected)
32123212

32133213

3214+
def test_struct_with_timestamp_tz():
3215+
# ARROW-7723
3216+
ts = pd.Timestamp.now()
3217+
3218+
# XXX: Ensure that this data does not get promoted to nanoseconds (and thus
3219+
# integers) to preserve behavior in 0.15.1
3220+
for unit in ['s', 'ms', 'us']:
3221+
arr = pa.array([ts], type=pa.timestamp(unit))
3222+
arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York'))
3223+
3224+
arr3 = pa.StructArray.from_arrays([arr, arr], ['start', 'stop'])
3225+
arr4 = pa.StructArray.from_arrays([arr2, arr2], ['start', 'stop'])
3226+
3227+
result = arr3.to_pandas()
3228+
assert isinstance(result[0]['start'], datetime)
3229+
assert isinstance(result[0]['stop'], datetime)
3230+
3231+
result = arr4.to_pandas()
3232+
assert isinstance(result[0]['start'], datetime)
3233+
assert isinstance(result[0]['stop'], datetime)
3234+
3235+
32143236
# ----------------------------------------------------------------------
32153237
# DictionaryArray tests
32163238

0 commit comments

Comments (0)