ARROW-1342: [Python] Support strided ndarrays in pandas conversion from nested lists

wesm · wesm · commit 2972c9d3a0d3 · 2017-08-09T17:47:30.000-04:00
This does drop the vector append to the builder that was there before. I'm going to do some local benchmarking to make sure this doesn't degrade performance unacceptably, will report back here Author: Wes McKinney <wes.mckinney@twosigma.com> Closes apache#956 from wesm/ARROW-1342 and squashes the following commits: f2ebeba [Wes McKinney] Fix cpplint issue f403f9d [Wes McKinney] Fix test case to be platform independent, note ARROW-1345. Improve quality of error message f4f44c1 [Wes McKinney] Fix test case where inferred list type is null ae5c831 [Wes McKinney] Drop striding check b4aecd3 [Wes McKinney] Support strided ndarrays in pandas conversion from nested lists
diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
@@ -25,6 +25,7 @@
 #include "arrow/python/platform.h"
 
 #include <cstdint>
+#include <string>
 
 namespace arrow {
 namespace py {
@@ -51,14 +52,54 @@ class Ndarray1DIndexer {
 
   int64_t size() const { return PyArray_SIZE(arr_); }
 
+  T* data() const { return data_; }
+
+  bool is_strided() const { return stride_ == 1; }
+
   T& operator[](size_type index) { return *(data_ + index * stride_); }
+  T& operator[](size_type index) const { return *(data_ + index * stride_); }
 
  private:
   PyArrayObject* arr_;
   T* data_;
   int64_t stride_;
 };
 
+static inline std::string GetNumPyTypeName(int npy_type) {
+#define TYPE_CASE(TYPE, NAME) \
+  case NPY_##TYPE:            \
+    return NAME;
+
+  switch (npy_type) {
+    TYPE_CASE(BOOL, "bool")
+    TYPE_CASE(INT8, "int8")
+    TYPE_CASE(INT16, "int16")
+    TYPE_CASE(INT32, "int32")
+    TYPE_CASE(INT64, "int64")
+#if (NPY_INT64 != NPY_LONGLONG)
+    TYPE_CASE(LONGLONG, "longlong")
+#endif
+    TYPE_CASE(UINT8, "uint8")
+    TYPE_CASE(UINT16, "uint16")
+    TYPE_CASE(UINT32, "uint32")
+    TYPE_CASE(UINT64, "uint64")
+#if (NPY_UINT64 != NPY_ULONGLONG)
+    TYPE_CASE(ULONGLONG, "ulonglong")
+#endif
+    TYPE_CASE(FLOAT16, "float16")
+    TYPE_CASE(FLOAT32, "float32")
+    TYPE_CASE(FLOAT64, "float64")
+    TYPE_CASE(DATETIME, "datetime64")
+    TYPE_CASE(OBJECT, "object")
+    TYPE_CASE(VOID, "void")
+    default:
+      break;
+  }
+
+#undef TYPE_CASE
+  return "unrecognized type in GetNumPyTypeName";
+}
+
 }  // namespace py
 }  // namespace arrow
 
diff --git a/cpp/src/arrow/python/pandas_to_arrow.cc b/cpp/src/arrow/python/pandas_to_arrow.cc
@@ -97,8 +97,6 @@ static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
   int64_t null_count = 0;
 
   Ndarray1DIndexer<T> values(arr);
-
-  // TODO(wesm): striding
   for (int i = 0; i < values.size(); ++i) {
     if (traits::isnull(values[i])) {
       ++null_count;
@@ -125,37 +123,42 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap
   return null_count;
 }
 
-template <int TYPE>
-static int64_t ValuesToValidBytes(const void* data, int64_t length,
-                                  uint8_t* valid_bytes) {
+template <int TYPE, typename BuilderType>
+static Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) {
   typedef internal::npy_traits<TYPE> traits;
   typedef typename traits::value_type T;
 
-  int64_t null_count = 0;
-  const T* values = reinterpret_cast<const T*>(data);
-
-  // TODO(wesm): striding
-  for (int i = 0; i < length; ++i) {
-    valid_bytes[i] = !traits::isnull(values[i]);
-    if (traits::isnull(values[i])) null_count++;
+  // TODO(wesm): Vector append when not strided
+  Ndarray1DIndexer<T> values(array);
+  if (traits::supports_nulls) {
+    for (int64_t i = 0; i < values.size(); ++i) {
+      if (traits::isnull(values[i])) {
+        RETURN_NOT_OK(builder->AppendNull());
+      } else {
+        RETURN_NOT_OK(builder->Append(values[i]));
+      }
+    }
+  } else {
+    for (int64_t i = 0; i < values.size(); ++i) {
+      RETURN_NOT_OK(builder->Append(values[i]));
+    }
   }
-
-  return null_count;
+  return Status::OK();
 }
 
 Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
   if (PyArray_NDIM(numpy_array) != 1) {
     return Status::Invalid("only handle 1-dimensional arrays");
   }
 
-  if (PyArray_DESCR(numpy_array)->type_num != np_type) {
-    return Status::Invalid("can only handle exact conversions");
+  const int received_type = PyArray_DESCR(numpy_array)->type_num;
+  if (received_type != np_type) {
+    std::stringstream ss;
+    ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got "
+       << GetNumPyTypeName(received_type);
+    return Status::Invalid(ss.str());
   }
 
-  npy_intp* astrides = PyArray_STRIDES(numpy_array);
-  if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) {
-    return Status::Invalid("No support for strided arrays in lists yet");
-  }
   return Status::OK();
 }
 
@@ -577,7 +580,7 @@ Status PandasConverter::ConvertDecimals() {
   RETURN_NOT_OK(ImportModule("decimal", &decimal));
   RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
 
-  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+  Ndarray1DIndexer<PyObject*> objects(arr_);
   PyObject* object = objects[0];
 
   int precision;
@@ -618,7 +621,7 @@ Status PandasConverter::ConvertTimes() {
   PyAcquireGIL lock;
   PyDateTime_IMPORT;
 
-  PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+  Ndarray1DIndexer<PyObject*> objects(arr_);
 
   // datetime.time stores microsecond resolution
   Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_);
@@ -906,7 +909,7 @@ Status LoopPySequence(PyObject* sequence, T func) {
     Py_ssize_t size = PySequence_Size(sequence);
     if (PyArray_Check(sequence)) {
       auto array = reinterpret_cast<PyArrayObject*>(sequence);
-      PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(array));
+      Ndarray1DIndexer<PyObject*> objects(array);
       for (int64_t i = 0; i < size; ++i) {
         RETURN_NOT_OK(func(objects[i]));
       }
@@ -934,7 +937,6 @@ template <int ITEM_TYPE, typename ArrowType>
 inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type,
                                                  ListBuilder* builder, PyObject* list) {
   typedef internal::npy_traits<ITEM_TYPE> traits;
-  typedef typename traits::value_type T;
   typedef typename traits::BuilderClass BuilderT;
 
   PyAcquireGIL lock;
@@ -956,24 +958,13 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
       // TODO(uwe): Support more complex numpy array structures
       RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE));
 
-      int64_t size = PyArray_DIM(numpy_array, 0);
-      auto data = reinterpret_cast<const T*>(PyArray_DATA(numpy_array));
-      if (traits::supports_nulls) {
-        RETURN_NOT_OK(null_bitmap_->Resize(size, false));
-        // TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't
-        // currently support this.
-        // ValuesToBitmap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
-        ValuesToValidBytes<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
-        return value_builder->Append(data, size, null_bitmap_->data());
-      } else {
-        return value_builder->Append(data, size);
-      }
+      return AppendNdarrayToBuilder<ITEM_TYPE, BuilderT>(numpy_array, value_builder);
     } else if (PyList_Check(object)) {
       int64_t size;
       std::shared_ptr<DataType> inferred_type;
       RETURN_NOT_OK(builder->Append(true));
       RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
-      if (inferred_type->id() != type->id()) {
+      if (inferred_type->id() != Type::NA && inferred_type->id() != type->id()) {
         std::stringstream ss;
         ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
         return Status::TypeError(ss.str());
@@ -1064,7 +1055,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
       std::shared_ptr<DataType> inferred_type;
       RETURN_NOT_OK(builder->Append(true));
       RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
-      if (inferred_type->id() != Type::STRING) {
+      if (inferred_type->id() != Type::NA && inferred_type->id() != Type::STRING) {
         std::stringstream ss;
         ss << inferred_type->ToString() << " cannot be converted to STRING.";
         return Status::TypeError(ss.str());
diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc
@@ -301,8 +301,8 @@ Table::Table(const std::shared_ptr<Schema>& schema,
 
   columns_.resize(columns.size());
   for (size_t i = 0; i < columns.size(); ++i) {
-    columns_[i] = std::make_shared<Column>(schema->field(static_cast<int>(i)),
-                                           columns[i]);
+    columns_[i] =
+        std::make_shared<Column>(schema->field(static_cast<int>(i)), columns[i]);
   }
 }
 
diff --git a/python/pyarrow/error.pxi b/python/pyarrow/error.pxi
@@ -65,7 +65,7 @@ cdef int check_status(const CStatus& status) nogil except -1:
         return 0
 
     with gil:
-        message = frombytes(status.ToString())
+        message = frombytes(status.message())
         if status.IsInvalid():
             raise ArrowInvalid(message)
         elif status.IsIOError():
@@ -85,4 +85,5 @@ cdef int check_status(const CStatus& status) nogil except -1:
         elif status.IsPlasmaStoreFull():
             raise PlasmaStoreFull(message)
         else:
+            message = frombytes(status.ToString())
             raise ArrowException(message)
diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd
@@ -42,6 +42,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
         CStatus()
 
         c_string ToString()
+        c_string message()
 
         c_bool ok()
         c_bool IsIOError()
diff --git a/python/pyarrow/tests/pandas_examples.py b/python/pyarrow/tests/pandas_examples.py
@@ -98,21 +98,25 @@ def dataframe_with_lists(include_index=False):
         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
         [0, 1, 2, 3, 4],
         None,
-        [0]
+        [0],
+        np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
+                 dtype=np.int64)[::2]
     ]
     fields.append(pa.field('double', pa.list_(pa.float64())))
     arrays['double'] = [
         [0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
         [0., 1., 2., 3., 4.],
         None,
-        [0.]
+        [0.],
+        np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
     ]
     fields.append(pa.field('str_list', pa.list_(pa.string())))
     arrays['str_list'] = [
         [u"1", u"ä"],
         None,
         [u"1"],
-        [u"1", u"2", u"3"]
+        [u"1", u"2", u"3"],
+        [],
     ]
 
     if include_index:
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
@@ -534,6 +534,15 @@ def test_column_of_lists(self):
             field = schema.field_by_name(column)
             self._check_array_roundtrip(df[column], type=field.type)
 
+    def test_column_of_lists_strided(self):
+        df, schema = dataframe_with_lists()
+        df = pd.concat([df] * 6, ignore_index=True)
+
+        arr = df['int64'].values[::3]
+        assert arr.strides[0] != 8
+
+        self._check_array_roundtrip(arr)
+
     def test_nested_lists_all_none(self):
         data = np.array([[None, None], None], dtype=object)
 

Original file line number	Diff line number	Diff line change
`@@ -301,8 +301,8 @@ Table::Table(const std::shared_ptr<Schema>& schema,`
`301`	`301`
`302`	`302`	`columns_.resize(columns.size());`
`303`	`303`	`for (size_t i = 0; i < columns.size(); ++i) {`
`304`		`- columns_[i] = std::make_shared<Column>(schema->field(static_cast<int>(i)),`
`305`		`- columns[i]);`
	`304`	`+ columns_[i] =`
	`305`	`+ std::make_shared<Column>(schema->field(static_cast<int>(i)), columns[i]);`
`306`	`306`	`}`
`307`	`307`	`}`
`308`	`308`