Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
8ccf0e9
New C++ compute kernels development framework project
wesm May 6, 2020
0b88a49
Finish porting SortToIndices, consolidate files and tests since there…
wesm May 19, 2020
13e0b4b
Refactoring selection kernels, not yet ported
wesm May 19, 2020
65165d6
filter/take compiling finally
wesm May 19, 2020
e4b0db3
Fix UBSAN/ASAN issues, take and filter kernels completely working now
wesm May 19, 2020
1c89e38
Port hash kernels, adapt execution framework to suit
wesm May 20, 2020
4f3e769
Fix memory allocation issue caught by ASAN
wesm May 20, 2020
b4098be
Some progress refactoring cast kernels
wesm May 20, 2020
4f84a7d
Progress building casting infrastructure
wesm May 20, 2020
e62c7fa
Cast tests compiling again
wesm May 20, 2020
1e9d0fc
Fix compare_benchmark.cc
wesm May 20, 2020
7039e3d
Don't yield empty batches from ExecBatchIterator
wesm May 20, 2020
f7da85f
Most casts working now, working on remaining
wesm May 21, 2020
23665d0
All tests passing again
wesm May 21, 2020
af13070
Update Python bindings, refine error types
wesm May 21, 2020
a10e773
Port R C++ code, though there is a failing test
wesm May 21, 2020
1285e24
Initial round of code review comments, add a bunch of comments
wesm May 22, 2020
e1ab1bd
Initial emkornfield comments
wesm May 22, 2020
2970658
Fix buglets with default exec chunksize change
wesm May 22, 2020
f6e82d2
Fix testing submodule
wesm May 22, 2020
3e32f1c
Initial MSVC compilation fixes
wesm May 22, 2020
f9233c6
More MSVC fixes
wesm May 22, 2020
fd28670
clang-format
wesm May 22, 2020
78bdf7c
Introduce TypeMatcher abstraction for input type checking and add imp…
wesm May 22, 2020
d7c4c39
Add missing timestamp kernels to Take and Filter
wesm May 22, 2020
1318d4c
Some cleanup, add missing static breaking some builds
wesm May 22, 2020
36ddf6d
Restore use of ChunkedArray for value_set in set lookup operations Is…
wesm May 22, 2020
c8394b0
[GLib] Use the new API
kou May 22, 2020
4cfafe5
Start working on Python bindings for new kernels API
wesm May 22, 2020
4ab5fa8
Fix IsInChunkedArrayInvoke test
wesm May 22, 2020
273a0a6
Simple Python bindings for new compute functionality. Unit tests must…
wesm May 22, 2020
7264af4
Fix build error with -DCMAKE_UNITY_BUILD=ON
kou May 22, 2020
c690506
Fix unit tests per minor API changes
wesm May 22, 2020
7288f45
Use size_t for hash type
kou May 22, 2020
0833b5c
Use size_t for kHashSeed
kou May 22, 2020
729a3be
Add missing ARROW_EXPORT
kou May 23, 2020
94184d1
Add missing "#pragma once"
kou May 23, 2020
c2090d9
Fix failure on VS 2019 where input bitmap was all 0 by chance
wesm May 23, 2020
8653503
Simplify some Python method implementations. Do not register unnecess…
wesm May 23, 2020
cb85735
Try again to fix failing VS 2019 unit test
wesm May 23, 2020
877618f
Do not crash on null options passed to filter function. Add prototype…
wesm May 24, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
626 changes: 211 additions & 415 deletions c_glib/arrow-glib/compute.cpp

Large diffs are not rendered by default.

58 changes: 28 additions & 30 deletions c_glib/arrow-glib/error.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,18 @@ G_BEGIN_DECLS

G_DEFINE_QUARK(garrow-error-quark, garrow_error)

static GArrowError
garrow_error_code(const arrow::Status &status)
G_END_DECLS

gboolean
garrow_error_check(GError **error,
const arrow::Status &status,
const char *context)
{
return garrow::check(error, status, context);
}

GArrowError
garrow_error_from_status(const arrow::Status &status)
{
switch (status.code()) {
case arrow::StatusCode::OK:
Expand Down Expand Up @@ -73,50 +83,38 @@ garrow_error_code(const arrow::Status &status)
return GARROW_ERROR_EXECUTION;
case arrow::StatusCode::AlreadyExists:
return GARROW_ERROR_ALREADY_EXISTS;

default:
return GARROW_ERROR_UNKNOWN;
}
}

G_END_DECLS
arrow::Status
garrow_error_to_status(GError *error,
arrow::StatusCode code,
const char *context)
{
std::stringstream message;
message << context << ": " << g_quark_to_string(error->domain);
message << "(" << error->code << "): ";
message << error->message;
g_error_free(error);
return arrow::Status(code, message.str());
}

namespace garrow {
gboolean
check(GError **error,
const arrow::Status &status,
const char *context) {
gboolean check(GError **error,
const arrow::Status &status,
const char *context) {
if (status.ok()) {
return TRUE;
} else {
g_set_error(error,
GARROW_ERROR,
garrow_error_code(status),
garrow_error_from_status(status),
"%s: %s",
context,
status.ToString().c_str());
return FALSE;
}
}
}

gboolean
garrow_error_check(GError **error,
const arrow::Status &status,
const char *context)
{
return garrow::check(error, status, context);
}

arrow::Status
garrow_error_to_status(GError *error,
arrow::StatusCode code,
const char *context)
{
std::stringstream message;
message << context << ": " << g_quark_to_string(error->domain);
message << "(" << error->code << "): ";
message << error->message;
g_error_free(error);
return arrow::Status(code, message.str());
}
46 changes: 34 additions & 12 deletions c_glib/arrow-glib/error.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,26 +23,48 @@

#include <arrow-glib/error.h>

gboolean garrow_error_check(GError **error,
const arrow::Status &status,
const char *context);
GArrowError garrow_error_from_status(const arrow::Status &status);
arrow::Status garrow_error_to_status(GError *error,
arrow::StatusCode code,
const char *context);

namespace garrow {
gboolean check(GError **error,
const arrow::Status &status,
const char *context);

template <typename TYPE>
template <typename CONTEXT_FUNC>
gboolean check(GError **error,
const arrow::Result<TYPE> &result,
const char *context) {
if (result.ok()) {
const arrow::Status &status,
CONTEXT_FUNC &&context_func) {
if (status.ok()) {
return TRUE;
} else {
return check(error, result.status(), context);
std::string context = std::move(context_func());
g_set_error(error,
GARROW_ERROR,
garrow_error_from_status(status),
"%s: %s",
context.c_str(),
status.ToString().c_str());
return FALSE;
}
}
}

gboolean garrow_error_check(GError **error,
const arrow::Status &status,
const char *context);
arrow::Status garrow_error_to_status(GError *error,
arrow::StatusCode code,
const char *context);
template <typename TYPE>
gboolean check(GError **error,
const arrow::Result<TYPE> &result,
const char *context) {
return check(error, result.status(), context);
}

template <typename TYPE, typename CONTEXT_FUNC>
gboolean check(GError **error,
const arrow::Result<TYPE> &result,
CONTEXT_FUNC &&context_func) {
return check(error, result.status(), context_func);
}
}
46 changes: 27 additions & 19 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ endfunction()

set(ARROW_SRCS
array.cc
builder.cc
array/builder_adaptive.cc
array/builder_base.cc
array/builder_binary.cc
Expand All @@ -134,8 +133,10 @@ set(ARROW_SRCS
array/dict_internal.cc
array/diff.cc
array/validate.cc
builder.cc
buffer.cc
compare.cc
datum.cc
device.cc
extension_type.cc
memory_pool.cc
Expand Down Expand Up @@ -319,24 +320,30 @@ endif()

if(ARROW_COMPUTE)
list(APPEND ARROW_SRCS
compute/context.cc
compute/kernels/aggregate.cc
compute/kernels/boolean.cc
compute/kernels/cast.cc
compute/kernels/compare.cc
compute/kernels/count.cc
compute/kernels/hash.cc
compute/kernels/filter.cc
compute/kernels/mean.cc
compute/kernels/minmax.cc
compute/kernels/sort_to_indices.cc
compute/kernels/nth_to_indices.cc
compute/kernels/sum.cc
compute/kernels/add.cc
compute/kernels/take.cc
compute/kernels/isin.cc
compute/kernels/match.cc
compute/kernels/util_internal.cc)
compute/api_aggregate.cc
compute/api_scalar.cc
compute/api_vector.cc
compute/cast.cc
compute/exec.cc
compute/function.cc
compute/kernel.cc
compute/registry.cc
compute/kernels/aggregate_basic.cc
compute/kernels/codegen_internal.cc
compute/kernels/scalar_arithmetic.cc
compute/kernels/scalar_boolean.cc
compute/kernels/scalar_cast_boolean.cc
compute/kernels/scalar_cast_internal.cc
compute/kernels/scalar_cast_nested.cc
compute/kernels/scalar_cast_numeric.cc
compute/kernels/scalar_cast_string.cc
compute/kernels/scalar_cast_temporal.cc
compute/kernels/scalar_compare.cc
compute/kernels/scalar_set_lookup.cc
compute/kernels/vector_filter.cc
compute/kernels/vector_hash.cc
compute/kernels/vector_sort.cc
compute/kernels/vector_take.cc)
endif()

if(ARROW_FILESYSTEM)
Expand Down Expand Up @@ -524,6 +531,7 @@ endif()

add_arrow_test(misc_test
SOURCES
datum_test.cc
memory_pool_test.cc
result_test.cc
pretty_print_test.cc
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/adapters/orc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ elseif(NOT MSVC)
set(ORC_MIN_TEST_LIBS ${ORC_MIN_TEST_LIBS} pthread ${CMAKE_DL_LIBS})
endif()

set(ORC_STATIC_TEST_LINK_LIBS ${ORC_MIN_TEST_LIBS} ${ARROW_LIBRARIES_FOR_STATIC_TESTS}
set(ORC_STATIC_TEST_LINK_LIBS ${ARROW_LIBRARIES_FOR_STATIC_TESTS} ${ORC_MIN_TEST_LIBS}
orc::liborc)

add_arrow_test(adapter_test
Expand Down
13 changes: 5 additions & 8 deletions cpp/src/arrow/array/diff_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,7 @@
#include "arrow/array/diff.h"
#include "arrow/buffer.h"
#include "arrow/builder.h"
#include "arrow/compute/context.h"
#include "arrow/compute/kernels/filter.h"
#include "arrow/compute/api.h"
#include "arrow/status.h"
#include "arrow/testing/gtest_common.h"
#include "arrow/testing/random.h"
Expand Down Expand Up @@ -119,20 +118,19 @@ class DiffTest : public ::testing::Test {

void BaseAndTargetFromRandomFilter(std::shared_ptr<Array> values,
double filter_probability) {
compute::Datum out_datum, base_filter, target_filter;
std::shared_ptr<Array> base_filter, target_filter;
do {
base_filter = this->rng_.Boolean(values->length(), filter_probability, 0.0);
target_filter = this->rng_.Boolean(values->length(), filter_probability, 0.0);
} while (base_filter.Equals(target_filter));
} while (base_filter->Equals(target_filter));

ASSERT_OK(compute::Filter(&ctx_, values, base_filter, {}, &out_datum));
ASSERT_OK_AND_ASSIGN(Datum out_datum, compute::Filter(values, base_filter));
base_ = out_datum.make_array();

ASSERT_OK(compute::Filter(&ctx_, values, target_filter, {}, &out_datum));
ASSERT_OK_AND_ASSIGN(out_datum, compute::Filter(values, target_filter));
target_ = out_datum.make_array();
}

compute::FunctionContext ctx_;
random::RandomArrayGenerator rng_;
std::shared_ptr<StructArray> edits_;
std::shared_ptr<Array> base_, target_;
Expand Down Expand Up @@ -616,7 +614,6 @@ void MakeSameLength(std::shared_ptr<Array>* a, std::shared_ptr<Array>* b) {
}

TEST_F(DiffTest, CompareRandomStruct) {
compute::FunctionContext ctx;
for (auto null_probability : {0.0, 0.25}) {
constexpr auto length = 1 << 10;
auto int32_values = this->rng_.Int32(length, 0, 127, null_probability);
Expand Down
9 changes: 7 additions & 2 deletions cpp/src/arrow/compute/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ function(ADD_ARROW_COMPUTE_TEST REL_TEST_NAME)
${ARG_UNPARSED_ARGUMENTS})
endfunction()

add_arrow_compute_test(compute_test)
add_arrow_benchmark(compute_benchmark)
add_arrow_compute_test(internals_test
SOURCES
function_test.cc
exec_test.cc
kernel_test.cc
registry_test.cc)
add_arrow_compute_test(exec_test)

add_subdirectory(kernels)
58 changes: 58 additions & 0 deletions cpp/src/arrow/compute/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<!---
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

## Apache Arrow C++ Compute Functions

This submodule contains analytical functions that process primarily Arrow
columnar data; some functions can process scalar or Arrow-based array
inputs. These are intended for use inside query engines, data frame libraries,
etc.

Many functions have SQL-like semantics in that they perform elementwise or
scalar operations on whole arrays at a time. Other functions are not SQL-like
and compute results that may be a different length or whose results depend on
the order of the values.

Some basic terminology:

* We use the term "function" to refer to particular general operation that may
have many different implementations corresponding to different combinations
of types or function behavior options.
* We call a specific implementation of a function a "kernel". When executing a
function on inputs, we must first select a suitable kernel (kernel selection
is called "dispatching") corresponding to the value types of the inputs
* Functions along with their kernel implementations are collected in a
"function registry". Given a function name and argument types, we can look up
that function and dispatch to a compatible kernel.

Types of functions

* Scalar functions: elementwise functions that perform scalar operations in a
vectorized manner. These functions are generally valid for SQL-like
context. These are called "scalar" in that the functions executed consider
each value in an array independently, and the output array or arrays have the
same length as the input arrays. The result for each array cell is generally
independent of its position in the array.
* Vector functions, which produce a result whose output is generally dependent
on the entire contents of the input arrays. These functions **are generally
not valid** for SQL-like processing because the output size may be different
than the input size, and the result may change based on the order of the
values in the array. This includes things like array subselection, sorting,
hashing, and more.
* Scalar aggregate functions of which can be used in a SQL-like context
27 changes: 12 additions & 15 deletions cpp/src/arrow/compute/api.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,17 @@
// specific language governing permissions and limitations
// under the License.

#pragma once
// NOTE: API is EXPERIMENTAL and will change without going through a
// deprecation cycle

#include "arrow/compute/context.h" // IWYU pragma: export
#include "arrow/compute/kernel.h" // IWYU pragma: export
#pragma once

#include "arrow/compute/kernels/boolean.h" // IWYU pragma: export
#include "arrow/compute/kernels/cast.h" // IWYU pragma: export
#include "arrow/compute/kernels/compare.h" // IWYU pragma: export
#include "arrow/compute/kernels/count.h" // IWYU pragma: export
#include "arrow/compute/kernels/filter.h" // IWYU pragma: export
#include "arrow/compute/kernels/hash.h" // IWYU pragma: export
#include "arrow/compute/kernels/isin.h" // IWYU pragma: export
#include "arrow/compute/kernels/mean.h" // IWYU pragma: export
#include "arrow/compute/kernels/nth_to_indices.h" // IWYU pragma: export
#include "arrow/compute/kernels/sort_to_indices.h" // IWYU pragma: export
#include "arrow/compute/kernels/sum.h" // IWYU pragma: export
#include "arrow/compute/kernels/take.h" // IWYU pragma: export
#include "arrow/compute/api_aggregate.h" // IWYU pragma: export
#include "arrow/compute/api_scalar.h" // IWYU pragma: export
#include "arrow/compute/api_vector.h" // IWYU pragma: export
#include "arrow/compute/cast.h" // IWYU pragma: export
#include "arrow/compute/exec.h" // IWYU pragma: export
#include "arrow/compute/function.h" // IWYU pragma: export
#include "arrow/compute/kernel.h" // IWYU pragma: export
#include "arrow/compute/registry.h" // IWYU pragma: export
#include "arrow/datum.h" // IWYU pragma: export
Loading