Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
File renamed without changes.
File renamed without changes.
3 changes: 2 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,8 @@ endif (UNIX)
if (${CLANG_FORMAT_FOUND})
# runs clang format and updates files in place.
add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1
`find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`)
`find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`
`find ${CMAKE_CURRENT_SOURCE_DIR}/../python -name \\*.cc -or -name \\*.h`)

# runs clang format and exits with a non-zero exit code if any files need to be reformatted
add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0
Expand Down
5 changes: 4 additions & 1 deletion python/pyarrow/includes/pyarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# distutils: language = c++

from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn,
from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, CTable,
CDataType, CStatus, Type, MemoryPool)

cimport pyarrow.includes.libarrow_io as arrow_io
Expand All @@ -39,6 +39,9 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr,
PyObject* py_ref, PyObject** out)

CStatus ConvertTableToPandas(const shared_ptr[CTable]& table,
int nthreads, PyObject** out)

MemoryPool* get_memory_pool()


Expand Down
54 changes: 42 additions & 12 deletions python/pyarrow/table.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,32 @@ cdef class RecordBatch:
return result


cdef table_to_blockmanager(const shared_ptr[CTable]& table, int nthreads):
cdef:
PyObject* result_obj
CColumn* col
int i

from pandas.core.internals import BlockManager, make_block
from pandas import RangeIndex

check_status(pyarrow.ConvertTableToPandas(table, nthreads, &result_obj))

result = PyObject_to_object(result_obj)

blocks = []
for block_arr, placement_arr in result:
blocks.append(make_block(block_arr, placement=placement_arr))

names = []
for i in range(table.get().num_columns()):
col = table.get().column(i).get()
names.append(frombytes(col.name()))

axes = [names, RangeIndex(table.get().num_rows())]
return BlockManager(blocks, axes)


cdef class Table:
"""
A collection of top-level named, equal length Arrow arrays.
Expand Down Expand Up @@ -584,7 +610,7 @@ cdef class Table:
table.init(c_table)
return table

def to_pandas(self):
def to_pandas(self, nthreads=1, block_based=True):
"""
Convert the arrow::Table to a pandas DataFrame

Expand All @@ -599,17 +625,21 @@ cdef class Table:

import pandas as pd

names = []
data = []
for i in range(self.table.num_columns()):
col = self.table.column(i)
column = self.column(i)
check_status(pyarrow.ConvertColumnToPandas(
col, <PyObject*> column, &arr))
names.append(frombytes(col.get().name()))
data.append(PyObject_to_object(arr))

return pd.DataFrame(dict(zip(names, data)), columns=names)
if block_based:
mgr = table_to_blockmanager(self.sp_table, nthreads)
return pd.DataFrame(mgr)
else:
names = []
data = []
for i in range(self.table.num_columns()):
col = self.table.column(i)
column = self.column(i)
check_status(pyarrow.ConvertColumnToPandas(
col, <PyObject*> column, &arr))
names.append(frombytes(col.get().name()))
data.append(PyObject_to_object(arr))

return pd.DataFrame(dict(zip(names, data)), columns=names)

@property
def name(self):
Expand Down
66 changes: 27 additions & 39 deletions python/src/pyarrow/adapters/builtin.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,16 +44,16 @@ static inline bool IsPyInteger(PyObject* obj) {

class ScalarVisitor {
public:
ScalarVisitor() :
total_count_(0),
none_count_(0),
bool_count_(0),
int_count_(0),
date_count_(0),
timestamp_count_(0),
float_count_(0),
binary_count_(0),
unicode_count_(0) {}
ScalarVisitor()
: total_count_(0),
none_count_(0),
bool_count_(0),
int_count_(0),
date_count_(0),
timestamp_count_(0),
float_count_(0),
binary_count_(0),
unicode_count_(0) {}

void Visit(PyObject* obj) {
++total_count_;
Expand Down Expand Up @@ -100,9 +100,7 @@ class ScalarVisitor {
}
}

int64_t total_count() const {
return total_count_;
}
int64_t total_count() const { return total_count_; }

private:
int64_t total_count_;
Expand All @@ -123,17 +121,14 @@ static constexpr int MAX_NESTING_LEVELS = 32;

class SeqVisitor {
public:
SeqVisitor() :
max_nesting_level_(0) {
SeqVisitor() : max_nesting_level_(0) {
memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int));
}

Status Visit(PyObject* obj, int level=0) {
Status Visit(PyObject* obj, int level = 0) {
Py_ssize_t size = PySequence_Size(obj);

if (level > max_nesting_level_) {
max_nesting_level_ = level;
}
if (level > max_nesting_level_) { max_nesting_level_ = level; }

for (int64_t i = 0; i < size; ++i) {
// TODO(wesm): Error checking?
Expand Down Expand Up @@ -188,19 +183,15 @@ class SeqVisitor {
int max_observed_level() const {
int result = 0;
for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
if (nesting_histogram_[i] > 0) {
result = i;
}
if (nesting_histogram_[i] > 0) { result = i; }
}
return result;
}

int num_nesting_levels() const {
int result = 0;
for (int i = 0; i < MAX_NESTING_LEVELS; ++i) {
if (nesting_histogram_[i] > 0) {
++result;
}
if (nesting_histogram_[i] > 0) { ++result; }
}
return result;
}
Expand All @@ -214,8 +205,8 @@ class SeqVisitor {
};

// Non-exhaustive type inference
static Status InferArrowType(PyObject* obj, int64_t* size,
std::shared_ptr<DataType>* out_type) {
static Status InferArrowType(
PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) {
*size = PySequence_Size(obj);
if (PyErr_Occurred()) {
// Not a sequence
Expand All @@ -224,19 +215,15 @@ static Status InferArrowType(PyObject* obj, int64_t* size,
}

// For 0-length sequences, refuse to guess
if (*size == 0) {
*out_type = arrow::null();
}
if (*size == 0) { *out_type = arrow::null(); }

SeqVisitor seq_visitor;
RETURN_NOT_OK(seq_visitor.Visit(obj));
RETURN_NOT_OK(seq_visitor.Validate());

*out_type = seq_visitor.GetType();

if (*out_type == nullptr) {
return Status::TypeError("Unable to determine data type");
}
if (*out_type == nullptr) { return Status::TypeError("Unable to determine data type"); }

return Status::OK();
}
Expand Down Expand Up @@ -337,7 +324,8 @@ class TimestampConverter : public TypedConverter<arrow::TimestampBuilder> {
if (item.obj() == Py_None) {
typed_builder_->AppendNull();
} else {
PyDateTime_DateTime* pydatetime = reinterpret_cast<PyDateTime_DateTime*>(item.obj());
PyDateTime_DateTime* pydatetime =
reinterpret_cast<PyDateTime_DateTime*>(item.obj());
struct tm datetime = {0};
datetime.tm_year = PyDateTime_GET_YEAR(pydatetime) - 1900;
datetime.tm_mon = PyDateTime_GET_MONTH(pydatetime) - 1;
Expand Down Expand Up @@ -462,6 +450,7 @@ class ListConverter : public TypedConverter<arrow::ListBuilder> {
}
return Status::OK();
}

protected:
std::shared_ptr<SeqConverter> value_converter_;
};
Expand Down Expand Up @@ -496,8 +485,8 @@ Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) {
builder_ = builder;
typed_builder_ = static_cast<arrow::ListBuilder*>(builder.get());

value_converter_ = GetConverter(static_cast<arrow::ListType*>(
builder->type().get())->value_type());
value_converter_ =
GetConverter(static_cast<arrow::ListType*>(builder->type().get())->value_type());
if (value_converter_ == nullptr) {
return Status::NotImplemented("value type not implemented");
}
Expand All @@ -521,8 +510,7 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) {
std::shared_ptr<SeqConverter> converter = GetConverter(type);
if (converter == nullptr) {
std::stringstream ss;
ss << "No type converter implemented for "
<< type->ToString();
ss << "No type converter implemented for " << type->ToString();
return Status::NotImplemented(ss.str());
}

Expand All @@ -536,4 +524,4 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out) {
return builder->Finish(out);
}

} // namespace pyarrow
} // namespace pyarrow
4 changes: 2 additions & 2 deletions python/src/pyarrow/adapters/builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,6 @@ namespace pyarrow {
PYARROW_EXPORT
arrow::Status ConvertPySequence(PyObject* obj, std::shared_ptr<arrow::Array>* out);

} // namespace pyarrow
} // namespace pyarrow

#endif // PYARROW_ADAPTERS_BUILTIN_H
#endif // PYARROW_ADAPTERS_BUILTIN_H
Loading