Skip to content
This repository was archived by the owner on Oct 21, 2024. It is now read-only.

Commit c00dfe8

Browse files
wesm authored and kszucs committed
ARROW-7723: [Python] Triage untested functional regression when converting tz-aware timestamp inside struct to pandas/NumPy format
This is definitely a hack so I left notes everywhere and explained the problem in detail on the ARROW-7723 JIRA. This reverts behavior in this particular case to 0.15.1 but there is an inconsistency in the data types returned by `struct<f0: timestamp[us]>` versus `struct<f0: timestamp[ns]>`. The former returns a dict with `datetime.datetime` values while the latter has `int64` values (because nanoseconds can't be safely coerced to `datetime.datetime` in general). It seems undesirable at the moment to have any downstream user depending on one behavior or the other, but leaving the behavior unchanged for now (given that many systems -- e.g. Spark SQL -- don't have nanoseconds) seems acceptable. Closes apache#6322 from wesm/ARROW-7723 and squashes the following commits: 05ab21b <Wes McKinney> pin pandas 0.25.3 2178977 <Wes McKinney> Add longer code comment explaining the hack 7142699 <Wes McKinney> Gross hack to avoid unwanted nanosecond promotion with tz-aware timestamp types 123d4b8 <Wes McKinney> Add failing unit test Authored-by: Wes McKinney <wesm+git@apache.org> Signed-off-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
1 parent 7ee6936 commit c00dfe8

File tree

4 files changed

+44
-4
lines changed

4 files changed

+44
-4
lines changed

ci/docker/conda-python-spark.dockerfile

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,11 @@ FROM ${repo}:${arch}-conda-python-${python}
2222

2323
ARG jdk=8
2424
ARG maven=3.5
25+
26+
# The Spark tests currently break with pandas >= 1.0
2527
RUN conda install -q \
2628
patch \
27-
pandas \
29+
pandas=0.25.3 \
2830
openjdk=${jdk} \
2931
maven=${maven} && \
3032
conda clean --all

cpp/src/arrow/python/arrow_to_pandas.cc

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -619,13 +619,24 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da
619619
auto array_type = arr->type();
620620
std::vector<OwnedRef> fields_data(num_fields);
621621
OwnedRef dict_item;
622+
623+
// XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second
624+
// through microsecond resolution tz-aware timestamps were being promoted to
625+
// use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy
626+
// array in this function. PyArray_GETITEM returns datetime.datetime for
627+
// units second through microsecond but PyLong for nanosecond (because
628+
// datetime.datetime does not support nanoseconds). We inserted this hack to
629+
// preserve the <= 0.15.1 behavior until a better solution can be devised
630+
PandasOptions modified_options = options;
631+
modified_options.ignore_timezone = true;
632+
622633
for (int c = 0; c < data.num_chunks(); c++) {
623634
auto arr = checked_cast<const StructArray*>(data.chunk(c).get());
624635
// Convert the struct arrays first
625636
for (int32_t i = 0; i < num_fields; i++) {
626637
PyObject* numpy_array;
627-
RETURN_NOT_OK(ConvertArrayToPandas(options, arr->field(static_cast<int>(i)),
628-
nullptr, &numpy_array));
638+
RETURN_NOT_OK(ConvertArrayToPandas(
639+
modified_options, arr->field(static_cast<int>(i)), nullptr, &numpy_array));
629640
fields_data[i].reset(numpy_array);
630641
}
631642

@@ -1678,7 +1689,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
16781689
break;
16791690
case Type::TIMESTAMP: {
16801691
const auto& ts_type = checked_cast<const TimestampType&>(*data.type());
1681-
if (ts_type.timezone() != "") {
1692+
// XXX: Hack here for ARROW-7723
1693+
if (ts_type.timezone() != "" && !options.ignore_timezone) {
16821694
*output_type = PandasWriter::DATETIME_NANO_TZ;
16831695
} else if (options.coerce_temporal_nanoseconds) {
16841696
*output_type = PandasWriter::DATETIME_NANO;

cpp/src/arrow/python/arrow_to_pandas.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@ struct PandasOptions {
5555
/// Coerce all date and timestamp to datetime64[ns]
5656
bool coerce_temporal_nanoseconds = false;
5757

58+
/// XXX(wesm): Hack for ARROW-7723 to opt out of DATETIME_NANO_TZ conversion
59+
/// path
60+
bool ignore_timezone = false;
61+
5862
/// \brief If true, do not create duplicate PyObject versions of equal
5963
/// objects. This only applies to immutable objects like strings or datetime
6064
/// objects

python/pyarrow/tests/test_pandas.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3211,6 +3211,28 @@ def test_cast_timestamp_unit():
32113211
assert result.equals(expected)
32123212

32133213

3214+
def test_struct_with_timestamp_tz():
3215+
# ARROW-7723
3216+
ts = pd.Timestamp.now()
3217+
3218+
# XXX: Ensure that this data does not get promoted to nanoseconds (and thus
3219+
# integers) to preserve behavior in 0.15.1
3220+
for unit in ['s', 'ms', 'us']:
3221+
arr = pa.array([ts], type=pa.timestamp(unit))
3222+
arr2 = pa.array([ts], type=pa.timestamp(unit, tz='America/New_York'))
3223+
3224+
arr3 = pa.StructArray.from_arrays([arr, arr], ['start', 'stop'])
3225+
arr4 = pa.StructArray.from_arrays([arr2, arr2], ['start', 'stop'])
3226+
3227+
result = arr3.to_pandas()
3228+
assert isinstance(result[0]['start'], datetime)
3229+
assert isinstance(result[0]['stop'], datetime)
3230+
3231+
result = arr4.to_pandas()
3232+
assert isinstance(result[0]['start'], datetime)
3233+
assert isinstance(result[0]['stop'], datetime)
3234+
3235+
32143236
# ----------------------------------------------------------------------
32153237
# DictionaryArray tests
32163238

0 commit comments

Comments (0)