Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/build-support/run_cpplint.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ def _check_some_files(completed_processes, filenames):
if problem_files:
msg = "{} had cpplint issues"
print("\n".join(map(msg.format, problem_files)))
if isinstance(stdout, bytes):
stdout = stdout.decode('utf8')
print(stdout, file=sys.stderr)
error = True
except Exception:
Expand Down
1 change: 1 addition & 0 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,7 @@ if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS)
endif()

set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep")
set(GTEST_HOME ${GTEST_PREFIX})
set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include")
set(GTEST_STATIC_LIB
"${GTEST_PREFIX}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_GTEST_DEBUG_EXTENSION}${CMAKE_STATIC_LIBRARY_SUFFIX}")
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -90,11 +90,12 @@ set(ARROW_SRCS
memory_pool.cc
pretty_print.cc
record_batch.cc
scalar.cc
sparse_tensor.cc
status.cc
table.cc
table_builder.cc
tensor.cc
sparse_tensor.cc
type.cc
visitor.cc
csv/converter.cc
Expand Down Expand Up @@ -306,6 +307,7 @@ add_arrow_test(buffer-test)
add_arrow_test(memory_pool-test)
add_arrow_test(pretty_print-test)
add_arrow_test(public-api-test)
add_arrow_test(scalar-test)
add_arrow_test(status-test)
add_arrow_test(stl-test)
add_arrow_test(type-test)
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/arrow/compute/compute-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,16 @@ void CheckImplicitConstructor(enum Datum::type expected_kind) {
}

TEST(TestDatum, ImplicitConstructors) {
CheckImplicitConstructor<Scalar>(Datum::SCALAR);

CheckImplicitConstructor<Array>(Datum::ARRAY);

// Instantiate from array subclass
CheckImplicitConstructor<BinaryArray>(Datum::ARRAY);

CheckImplicitConstructor<ChunkedArray>(Datum::CHUNKED_ARRAY);
CheckImplicitConstructor<RecordBatch>(Datum::RECORD_BATCH);

CheckImplicitConstructor<Table>(Datum::TABLE);
}

Expand Down
61 changes: 10 additions & 51 deletions cpp/src/arrow/compute/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

#include "arrow/array.h"
#include "arrow/record_batch.h"
#include "arrow/scalar.h"
#include "arrow/table.h"
#include "arrow/util/macros.h"
#include "arrow/util/variant.h" // IWYU pragma: export
Expand Down Expand Up @@ -55,68 +56,20 @@ class ARROW_EXPORT OpKernel {
virtual ~OpKernel() = default;
};

/// \brief Placeholder for Scalar values until we implement these
struct ARROW_EXPORT Scalar {
util::variant<bool, uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, uint64_t,
int64_t, float, double>
value;

explicit Scalar(bool value) : value(value) {}
explicit Scalar(uint8_t value) : value(value) {}
explicit Scalar(int8_t value) : value(value) {}
explicit Scalar(uint16_t value) : value(value) {}
explicit Scalar(int16_t value) : value(value) {}
explicit Scalar(uint32_t value) : value(value) {}
explicit Scalar(int32_t value) : value(value) {}
explicit Scalar(uint64_t value) : value(value) {}
explicit Scalar(int64_t value) : value(value) {}
explicit Scalar(float value) : value(value) {}
explicit Scalar(double value) : value(value) {}

Type::type kind() const {
switch (this->value.which()) {
case 0:
return Type::BOOL;
case 1:
return Type::UINT8;
case 2:
return Type::INT8;
case 3:
return Type::UINT16;
case 4:
return Type::INT16;
case 5:
return Type::UINT32;
case 6:
return Type::INT32;
case 7:
return Type::UINT64;
case 8:
return Type::INT64;
case 9:
return Type::FLOAT;
case 10:
return Type::DOUBLE;
default:
return Type::NA;
}
}
};

/// \class Datum
/// \brief Variant type for various Arrow C++ data structures
struct ARROW_EXPORT Datum {
enum type { NONE, SCALAR, ARRAY, CHUNKED_ARRAY, RECORD_BATCH, TABLE, COLLECTION };

util::variant<decltype(NULLPTR), Scalar, std::shared_ptr<ArrayData>,
util::variant<decltype(NULLPTR), std::shared_ptr<Scalar>, std::shared_ptr<ArrayData>,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

given scalars are supposed to be "small" it might be worth comment on why you use a shared_ptr type here (i.e. instead of the previous variant)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • Scalars of nested types may be semantically small but physically occupy a fairly large amount of memory
  • The object model uses subclasses. So you need to use a pointer type of some kind here

std::shared_ptr<ChunkedArray>, std::shared_ptr<RecordBatch>,
std::shared_ptr<Table>, std::vector<Datum>>
value;

/// \brief Empty datum, to be populated elsewhere
Datum() : value(NULLPTR) {}

Datum(const Scalar& value) // NOLINT implicit conversion
Datum(const std::shared_ptr<Scalar>& value) // NOLINT implicit conversion
: value(value) {}
Datum(const std::shared_ptr<ArrayData>& value) // NOLINT implicit conversion
: value(value) {}
Expand Down Expand Up @@ -188,14 +141,18 @@ struct ARROW_EXPORT Datum {
return util::get<std::vector<Datum>>(this->value);
}

Scalar scalar() const { return util::get<Scalar>(this->value); }
std::shared_ptr<Scalar> scalar() const {
return util::get<std::shared_ptr<Scalar>>(this->value);
}

bool is_array() const { return this->kind() == Datum::ARRAY; }

bool is_arraylike() const {
return this->kind() == Datum::ARRAY || this->kind() == Datum::CHUNKED_ARRAY;
}

bool is_scalar() const { return this->kind() == Datum::SCALAR; }

/// \brief The value type of the variant, if any
///
/// \return nullptr if no type
Expand All @@ -204,6 +161,8 @@ struct ARROW_EXPORT Datum {
return util::get<std::shared_ptr<ArrayData>>(this->value)->type;
} else if (this->kind() == Datum::CHUNKED_ARRAY) {
return util::get<std::shared_ptr<ChunkedArray>>(this->value)->type();
} else if (this->kind() == Datum::SCALAR) {
return util::get<std::shared_ptr<Scalar>>(this->value)->type;
}
return NULLPTR;
}
Expand Down
69 changes: 36 additions & 33 deletions cpp/src/arrow/compute/kernels/aggregate-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include "arrow/compute/kernels/sum.h"
#include "arrow/compute/test-util.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/checked_cast.h"

#include "arrow/testing/gtest_common.h"
#include "arrow/testing/gtest_util.h"
Expand All @@ -36,47 +38,47 @@ using std::vector;
namespace arrow {
namespace compute {

template <typename CType, typename Enable = void>
template <typename Type, typename Enable = void>
struct DatumEqual {
static void EnsureEqual(const Datum& lhs, const Datum& rhs) {}
};

template <typename CType>
struct DatumEqual<CType,
typename std::enable_if<std::is_floating_point<CType>::value>::type> {
template <typename Type>
struct DatumEqual<Type, typename std::enable_if<IsFloatingPoint<Type>::Value>::type> {
static constexpr double kArbitraryDoubleErrorBound = 1.0;
using ScalarType = typename TypeTraits<Type>::ScalarType;

static void EnsureEqual(const Datum& lhs, const Datum& rhs) {
ASSERT_EQ(lhs.kind(), rhs.kind());
if (lhs.kind() == Datum::SCALAR) {
ASSERT_EQ(lhs.scalar().kind(), rhs.scalar().kind());
ASSERT_NEAR(util::get<CType>(lhs.scalar().value),
util::get<CType>(rhs.scalar().value), kArbitraryDoubleErrorBound);
auto left = static_cast<const ScalarType*>(lhs.scalar().get());
auto right = static_cast<const ScalarType*>(rhs.scalar().get());
ASSERT_EQ(left->type->id(), right->type->id());
ASSERT_NEAR(left->value, right->value, kArbitraryDoubleErrorBound);
}
}
};

template <typename CType>
struct DatumEqual<CType,
typename std::enable_if<!std::is_floating_point<CType>::value>::type> {
template <typename Type>
struct DatumEqual<Type, typename std::enable_if<!IsFloatingPoint<Type>::value>::type> {
using ScalarType = typename TypeTraits<Type>::ScalarType;
static void EnsureEqual(const Datum& lhs, const Datum& rhs) {
ASSERT_EQ(lhs.kind(), rhs.kind());
if (lhs.kind() == Datum::SCALAR) {
ASSERT_EQ(lhs.scalar().kind(), rhs.scalar().kind());
ASSERT_EQ(util::get<CType>(lhs.scalar().value),
util::get<CType>(rhs.scalar().value));
auto left = static_cast<const ScalarType*>(lhs.scalar().get());
auto right = static_cast<const ScalarType*>(rhs.scalar().get());
ASSERT_EQ(left->type->id(), right->type->id());
ASSERT_EQ(left->value, right->value);
}
}
};

template <typename ArrowType>
void ValidateSum(FunctionContext* ctx, const Array& input, Datum expected) {
using CType = typename ArrowType::c_type;
using SumType = typename FindAccumulatorType<CType>::Type;

using OutputType = typename FindAccumulatorType<ArrowType>::Type;
Datum result;
ASSERT_OK(Sum(ctx, input, &result));
DatumEqual<SumType>::EnsureEqual(result, expected);
DatumEqual<OutputType>::EnsureEqual(result, expected);
}

template <typename ArrowType>
Expand All @@ -87,11 +89,11 @@ void ValidateSum(FunctionContext* ctx, const char* json, Datum expected) {

template <typename ArrowType>
static Datum DummySum(const Array& array) {
using CType = typename ArrowType::c_type;
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
using SumType = typename FindAccumulatorType<CType>::Type;
using SumType = typename FindAccumulatorType<ArrowType>::Type;
using SumScalarType = typename TypeTraits<SumType>::ScalarType;

SumType sum = 0;
typename SumType::c_type sum = 0;
int64_t count = 0;

const auto& array_numeric = reinterpret_cast<const ArrayType&>(array);
Expand All @@ -104,7 +106,11 @@ static Datum DummySum(const Array& array) {
}
}

return (count > 0) ? Datum(Scalar(sum)) : Datum();
if (count > 0) {
return Datum(std::make_shared<SumScalarType>(sum));
} else {
return Datum(std::make_shared<SumScalarType>(0, false));
}
}

template <typename ArrowType>
Expand All @@ -115,24 +121,21 @@ void ValidateSum(FunctionContext* ctx, const Array& array) {
template <typename ArrowType>
class TestSumKernelNumeric : public ComputeFixture, public TestBase {};

typedef ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type,
UInt32Type, Int64Type, UInt64Type, FloatType, DoubleType>
NumericArrowTypes;

TYPED_TEST_CASE(TestSumKernelNumeric, NumericArrowTypes);
TYPED_TEST(TestSumKernelNumeric, SimpleSum) {
using CType = typename TypeParam::c_type;
using SumType = typename FindAccumulatorType<CType>::Type;
using SumType = typename FindAccumulatorType<TypeParam>::Type;
using ScalarType = typename TypeTraits<SumType>::ScalarType;
using T = typename TypeParam::c_type;

ValidateSum<TypeParam>(&this->ctx_, "[]", Datum());
ValidateSum<TypeParam>(&this->ctx_, "[]",
Datum(std::make_shared<ScalarType>(0, false)));

ValidateSum<TypeParam>(&this->ctx_, "[0, 1, 2, 3, 4, 5]",
Datum(Scalar(static_cast<SumType>(5 * 6 / 2))));
Datum(std::make_shared<ScalarType>(static_cast<T>(5 * 6 / 2))));

// Avoid this tests for (U)Int8Type
if (sizeof(CType) > 1)
ValidateSum<TypeParam>(&this->ctx_, "[1000, null, 300, null, 30, null, 7]",
Datum(Scalar(static_cast<SumType>(1337))));
const T expected_result = static_cast<T>(14);
ValidateSum<TypeParam>(&this->ctx_, "[1, null, 3, null, 3, null, 7]",
Datum(std::make_shared<ScalarType>(expected_result)));
}

template <typename ArrowType>
Expand Down
23 changes: 17 additions & 6 deletions cpp/src/arrow/compute/kernels/sum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@
namespace arrow {
namespace compute {

template <typename CType, typename SumType = typename FindAccumulatorType<CType>::Type>
template <typename ArrowType,
typename SumType = typename FindAccumulatorType<ArrowType>::Type>
struct SumState {
using ThisType = SumState<CType, SumType>;
using ThisType = SumState<ArrowType, SumType>;

ThisType operator+(const ThisType& rhs) const {
return ThisType(this->count + rhs.count, this->sum + rhs.sum);
Expand All @@ -43,11 +44,16 @@ struct SumState {
return *this;
}

std::shared_ptr<Scalar> AsScalar() const {
using ScalarType = typename TypeTraits<SumType>::ScalarType;
return std::make_shared<ScalarType>(this->sum);
}

size_t count = 0;
SumType sum = 0;
typename SumType::c_type sum = 0;
};

template <typename ArrowType, typename StateType = SumState<typename ArrowType::c_type>>
template <typename ArrowType, typename StateType = SumState<ArrowType>>
class SumAggregateFunction final : public AggregateFunctionStaticState<StateType> {
using CType = typename TypeTraits<ArrowType>::CType;
using ArrayType = typename TypeTraits<ArrowType>::ArrayType;
Expand All @@ -71,7 +77,12 @@ class SumAggregateFunction final : public AggregateFunctionStaticState<StateType
}

Status Finalize(const StateType& src, Datum* output) const override {
*output = (src.count > 0) ? Datum(Scalar(src.sum)) : Datum();
auto boxed = src.AsScalar();
if (src.count == 0) {
// TODO(wesm): Currently null, but fix this
boxed->is_valid = false;
}
*output = boxed;
return Status::OK();
}

Expand Down Expand Up @@ -185,7 +196,7 @@ Status Sum(FunctionContext* ctx, const Datum& value, Datum* out) {
}

Status Sum(FunctionContext* ctx, const Array& array, Datum* out) {
return Sum(ctx, Datum(array.data()), out);
return Sum(ctx, array.data(), out);
}

} // namespace compute
Expand Down
19 changes: 9 additions & 10 deletions cpp/src/arrow/compute/kernels/sum.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <type_traits>

#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/visibility.h"

namespace arrow {
Expand All @@ -34,25 +36,22 @@ namespace compute {
// Find the largest compatible primitive type for a primitive type.
template <typename I, typename Enable = void>
struct FindAccumulatorType {
using Type = double;
using Type = DoubleType;
};

template <typename I>
struct FindAccumulatorType<I, typename std::enable_if<std::is_integral<I>::value &&
std::is_signed<I>::value>::type> {
using Type = int64_t;
struct FindAccumulatorType<I, typename std::enable_if<IsSignedInt<I>::value>::type> {
using Type = Int64Type;
};

template <typename I>
struct FindAccumulatorType<I, typename std::enable_if<std::is_integral<I>::value &&
std::is_unsigned<I>::value>::type> {
using Type = uint64_t;
struct FindAccumulatorType<I, typename std::enable_if<IsUnsignedInt<I>::value>::type> {
using Type = UInt64Type;
};

template <typename I>
struct FindAccumulatorType<
I, typename std::enable_if<std::is_floating_point<I>::value>::type> {
using Type = double;
struct FindAccumulatorType<I, typename std::enable_if<IsFloatingPoint<I>::value>::type> {
using Type = DoubleType;
};

struct Datum;
Expand Down
Loading