Skip to content

Commit 2972c9d

Browse files
committed
ARROW-1342: [Python] Support strided ndarrays in pandas conversion from nested lists
This drops the vector append to the builder that was there before. I'm going to do some local benchmarking to make sure this doesn't degrade performance unacceptably, and will report back here. Author: Wes McKinney <wes.mckinney@twosigma.com>. Closes apache#956 from wesm/ARROW-1342 and squashes the following commits: f2ebeba [Wes McKinney] Fix cpplint issue; f403f9d [Wes McKinney] Fix test case to be platform independent, note ARROW-1345. Improve quality of error message; f4f44c1 [Wes McKinney] Fix test case where inferred list type is null; ae5c831 [Wes McKinney] Drop striding check; b4aecd3 [Wes McKinney] Support strided ndarrays in pandas conversion from nested lists
1 parent e44ede8 commit 2972c9d

File tree

7 files changed

+91
-44
lines changed

7 files changed

+91
-44
lines changed

cpp/src/arrow/python/numpy-internal.h

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "arrow/python/platform.h"
2626

2727
#include <cstdint>
28+
#include <string>
2829

2930
namespace arrow {
3031
namespace py {
@@ -51,14 +52,54 @@ class Ndarray1DIndexer {
5152

5253
int64_t size() const { return PyArray_SIZE(arr_); }
5354

55+
T* data() const { return data_; }
56+
57+
// Reports whether consecutive elements are NOT densely packed. operator[]
// applies stride_ as an element offset (data_ + index * stride_), so a stride
// of exactly 1 is the contiguous case; any other value (including a negative
// one) means the array is strided.
bool is_strided() const { return stride_ != 1; }
58+
5459
T& operator[](size_type index) { return *(data_ + index * stride_); }
60+
T& operator[](size_type index) const { return *(data_ + index * stride_); }
5561

5662
private:
5763
PyArrayObject* arr_;
5864
T* data_;
5965
int64_t stride_;
6066
};
6167

68+
// Maps a NumPy type number (one of the NPY_* constants handled below) to a
// short human-readable name, e.g. for building error messages. Any type number
// without a matching case falls out of the switch and yields the fixed string
// "unrecognized type in GetNumPyTypeName".
static inline std::string GetNumPyTypeName(int npy_type) {
69+
#define TYPE_CASE(TYPE, NAME) \
70+
case NPY_##TYPE: \
71+
return NAME;
72+
73+
switch (npy_type) {
74+
TYPE_CASE(BOOL, "bool")
75+
TYPE_CASE(INT8, "int8")
76+
TYPE_CASE(INT16, "int16")
77+
TYPE_CASE(INT32, "int32")
78+
TYPE_CASE(INT64, "int64")
79+
// The LONGLONG case is compiled in only when the preprocessor evaluates
// NPY_INT64 and NPY_LONGLONG as different values.
#if (NPY_INT64 != NPY_LONGLONG)
80+
TYPE_CASE(LONGLONG, "longlong")
81+
#endif
82+
TYPE_CASE(UINT8, "uint8")
83+
TYPE_CASE(UINT16, "uint16")
84+
TYPE_CASE(UINT32, "uint32")
85+
TYPE_CASE(UINT64, "uint64")
86+
// Same guard as above, for the unsigned counterpart.
#if (NPY_UINT64 != NPY_ULONGLONG)
87+
TYPE_CASE(ULONGLONG, "ulonglong")
88+
#endif
89+
TYPE_CASE(FLOAT16, "float16")
90+
TYPE_CASE(FLOAT32, "float32")
91+
TYPE_CASE(FLOAT64, "float64")
92+
TYPE_CASE(DATETIME, "datetime64")
93+
TYPE_CASE(OBJECT, "object")
94+
TYPE_CASE(VOID, "void")
95+
default:
96+
break;
97+
}
98+
99+
// TYPE_CASE is only meant for the switch above; undefine it so it does not
// leak to code that includes this header.
#undef TYPE_CASE
100+
return "unrecognized type in GetNumPyTypeName";
101+
}
102+
62103
} // namespace py
63104
} // namespace arrow
64105

cpp/src/arrow/python/pandas_to_arrow.cc

Lines changed: 29 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,6 @@ static int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) {
9797
int64_t null_count = 0;
9898

9999
Ndarray1DIndexer<T> values(arr);
100-
101-
// TODO(wesm): striding
102100
for (int i = 0; i < values.size(); ++i) {
103101
if (traits::isnull(values[i])) {
104102
++null_count;
@@ -125,37 +123,42 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap
125123
return null_count;
126124
}
127125

128-
template <int TYPE>
129-
static int64_t ValuesToValidBytes(const void* data, int64_t length,
130-
uint8_t* valid_bytes) {
126+
template <int TYPE, typename BuilderType>
127+
static Status AppendNdarrayToBuilder(PyArrayObject* array, BuilderType* builder) {
131128
typedef internal::npy_traits<TYPE> traits;
132129
typedef typename traits::value_type T;
133130

134-
int64_t null_count = 0;
135-
const T* values = reinterpret_cast<const T*>(data);
136-
137-
// TODO(wesm): striding
138-
for (int i = 0; i < length; ++i) {
139-
valid_bytes[i] = !traits::isnull(values[i]);
140-
if (traits::isnull(values[i])) null_count++;
131+
// TODO(wesm): Vector append when not strided
132+
Ndarray1DIndexer<T> values(array);
133+
if (traits::supports_nulls) {
134+
for (int64_t i = 0; i < values.size(); ++i) {
135+
if (traits::isnull(values[i])) {
136+
RETURN_NOT_OK(builder->AppendNull());
137+
} else {
138+
RETURN_NOT_OK(builder->Append(values[i]));
139+
}
140+
}
141+
} else {
142+
for (int64_t i = 0; i < values.size(); ++i) {
143+
RETURN_NOT_OK(builder->Append(values[i]));
144+
}
141145
}
142-
143-
return null_count;
146+
return Status::OK();
144147
}
145148

146149
Status CheckFlatNumpyArray(PyArrayObject* numpy_array, int np_type) {
147150
if (PyArray_NDIM(numpy_array) != 1) {
148151
return Status::Invalid("only handle 1-dimensional arrays");
149152
}
150153

151-
if (PyArray_DESCR(numpy_array)->type_num != np_type) {
152-
return Status::Invalid("can only handle exact conversions");
154+
const int received_type = PyArray_DESCR(numpy_array)->type_num;
155+
if (received_type != np_type) {
156+
std::stringstream ss;
157+
ss << "trying to convert NumPy type " << GetNumPyTypeName(np_type) << " but got "
158+
<< GetNumPyTypeName(received_type);
159+
return Status::Invalid(ss.str());
153160
}
154161

155-
npy_intp* astrides = PyArray_STRIDES(numpy_array);
156-
if (astrides[0] != PyArray_DESCR(numpy_array)->elsize) {
157-
return Status::Invalid("No support for strided arrays in lists yet");
158-
}
159162
return Status::OK();
160163
}
161164

@@ -577,7 +580,7 @@ Status PandasConverter::ConvertDecimals() {
577580
RETURN_NOT_OK(ImportModule("decimal", &decimal));
578581
RETURN_NOT_OK(ImportFromModule(decimal, "Decimal", &Decimal));
579582

580-
PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
583+
Ndarray1DIndexer<PyObject*> objects(arr_);
581584
PyObject* object = objects[0];
582585

583586
int precision;
@@ -618,7 +621,7 @@ Status PandasConverter::ConvertTimes() {
618621
PyAcquireGIL lock;
619622
PyDateTime_IMPORT;
620623

621-
PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
624+
Ndarray1DIndexer<PyObject*> objects(arr_);
622625

623626
// datetime.time stores microsecond resolution
624627
Time64Builder builder(::arrow::time64(TimeUnit::MICRO), pool_);
@@ -906,7 +909,7 @@ Status LoopPySequence(PyObject* sequence, T func) {
906909
Py_ssize_t size = PySequence_Size(sequence);
907910
if (PyArray_Check(sequence)) {
908911
auto array = reinterpret_cast<PyArrayObject*>(sequence);
909-
PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(array));
912+
Ndarray1DIndexer<PyObject*> objects(array);
910913
for (int64_t i = 0; i < size; ++i) {
911914
RETURN_NOT_OK(func(objects[i]));
912915
}
@@ -934,7 +937,6 @@ template <int ITEM_TYPE, typename ArrowType>
934937
inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>& type,
935938
ListBuilder* builder, PyObject* list) {
936939
typedef internal::npy_traits<ITEM_TYPE> traits;
937-
typedef typename traits::value_type T;
938940
typedef typename traits::BuilderClass BuilderT;
939941

940942
PyAcquireGIL lock;
@@ -956,24 +958,13 @@ inline Status PandasConverter::ConvertTypedLists(const std::shared_ptr<DataType>
956958
// TODO(uwe): Support more complex numpy array structures
957959
RETURN_NOT_OK(CheckFlatNumpyArray(numpy_array, ITEM_TYPE));
958960

959-
int64_t size = PyArray_DIM(numpy_array, 0);
960-
auto data = reinterpret_cast<const T*>(PyArray_DATA(numpy_array));
961-
if (traits::supports_nulls) {
962-
RETURN_NOT_OK(null_bitmap_->Resize(size, false));
963-
// TODO(uwe): A bitmap would be more space-efficient but the Builder API doesn't
964-
// currently support this.
965-
// ValuesToBitmap<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
966-
ValuesToValidBytes<ITEM_TYPE>(data, size, null_bitmap_->mutable_data());
967-
return value_builder->Append(data, size, null_bitmap_->data());
968-
} else {
969-
return value_builder->Append(data, size);
970-
}
961+
return AppendNdarrayToBuilder<ITEM_TYPE, BuilderT>(numpy_array, value_builder);
971962
} else if (PyList_Check(object)) {
972963
int64_t size;
973964
std::shared_ptr<DataType> inferred_type;
974965
RETURN_NOT_OK(builder->Append(true));
975966
RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
976-
if (inferred_type->id() != type->id()) {
967+
if (inferred_type->id() != Type::NA && inferred_type->id() != type->id()) {
977968
std::stringstream ss;
978969
ss << inferred_type->ToString() << " cannot be converted to " << type->ToString();
979970
return Status::TypeError(ss.str());
@@ -1064,7 +1055,7 @@ inline Status PandasConverter::ConvertTypedLists<NPY_OBJECT, StringType>(
10641055
std::shared_ptr<DataType> inferred_type;
10651056
RETURN_NOT_OK(builder->Append(true));
10661057
RETURN_NOT_OK(InferArrowTypeAndSize(object, &size, &inferred_type));
1067-
if (inferred_type->id() != Type::STRING) {
1058+
if (inferred_type->id() != Type::NA && inferred_type->id() != Type::STRING) {
10681059
std::stringstream ss;
10691060
ss << inferred_type->ToString() << " cannot be converted to STRING.";
10701061
return Status::TypeError(ss.str());

cpp/src/arrow/table.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -301,8 +301,8 @@ Table::Table(const std::shared_ptr<Schema>& schema,
301301

302302
columns_.resize(columns.size());
303303
for (size_t i = 0; i < columns.size(); ++i) {
304-
columns_[i] = std::make_shared<Column>(schema->field(static_cast<int>(i)),
305-
columns[i]);
304+
columns_[i] =
305+
std::make_shared<Column>(schema->field(static_cast<int>(i)), columns[i]);
306306
}
307307
}
308308

python/pyarrow/error.pxi

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ cdef int check_status(const CStatus& status) nogil except -1:
6565
return 0
6666

6767
with gil:
68-
message = frombytes(status.ToString())
68+
message = frombytes(status.message())
6969
if status.IsInvalid():
7070
raise ArrowInvalid(message)
7171
elif status.IsIOError():
@@ -85,4 +85,5 @@ cdef int check_status(const CStatus& status) nogil except -1:
8585
elif status.IsPlasmaStoreFull():
8686
raise PlasmaStoreFull(message)
8787
else:
88+
message = frombytes(status.ToString())
8889
raise ArrowException(message)

python/pyarrow/includes/common.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
4242
CStatus()
4343

4444
c_string ToString()
45+
c_string message()
4546

4647
c_bool ok()
4748
c_bool IsIOError()

python/pyarrow/tests/pandas_examples.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,21 +98,25 @@ def dataframe_with_lists(include_index=False):
9898
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
9999
[0, 1, 2, 3, 4],
100100
None,
101-
[0]
101+
[0],
102+
np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9] * 2,
103+
dtype=np.int64)[::2]
102104
]
103105
fields.append(pa.field('double', pa.list_(pa.float64())))
104106
arrays['double'] = [
105107
[0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
106108
[0., 1., 2., 3., 4.],
107109
None,
108-
[0.]
110+
[0.],
111+
np.array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.] * 2)[::2],
109112
]
110113
fields.append(pa.field('str_list', pa.list_(pa.string())))
111114
arrays['str_list'] = [
112115
[u"1", u"ä"],
113116
None,
114117
[u"1"],
115-
[u"1", u"2", u"3"]
118+
[u"1", u"2", u"3"],
119+
[],
116120
]
117121

118122
if include_index:

python/pyarrow/tests/test_convert_pandas.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,15 @@ def test_column_of_lists(self):
534534
field = schema.field_by_name(column)
535535
self._check_array_roundtrip(df[column], type=field.type)
536536

537+
def test_column_of_lists_strided(self):
538+
# Round-trips a strided slice of the 'int64' list column through
# _check_array_roundtrip.
df, schema = dataframe_with_lists()
539+
# Repeat the frame six times so the step-3 slice below still has several rows.
df = pd.concat([df] * 6, ignore_index=True)
540+
541+
# Taking every third element produces a view rather than a packed copy.
arr = df['int64'].values[::3]
542+
# Sanity check that the slice is not laid out with an 8-byte stride.
assert arr.strides[0] != 8
543+
544+
self._check_array_roundtrip(arr)
545+
537546
def test_nested_lists_all_none(self):
538547
data = np.array([[None, None], None], dtype=object)
539548

0 commit comments

Comments
 (0)