Skip to content

Commit ed6ec3b

Browse files
committed
ARROW-373: [C++] JSON serialization format for testing
C++ version of ARROW-372

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes #202 from wesm/ARROW-373 and squashes the following commits:

d13a05f [Wes McKinney] Compiler warning
72c24fe [Wes McKinney] Add a minimal literal JSON example
a2cf47b [Wes McKinney] cpplint
3d9fcc2 [Wes McKinney] Complete round trip json file test with multiple record batches
2753449 [Wes McKinney] Complete draft json roundtrip implementation. tests not complete yet
3d6bbbd [Wes McKinney] Start high level writer scaffold
6bbd669 [Wes McKinney] Tweaks
e2e86b5 [Wes McKinney] Test JSON array roundtrip for numeric types, strings, lists, structs
82f108b [Wes McKinney] Refactoring. Array test scaffold
0891378 [Wes McKinney] Declare loop variables
6566343 [Wes McKinney] Recursively construct children for list/struct
35c2f85 [Wes McKinney] Refactoring. Start drafting string/list reader
f26402a [Wes McKinney] Install type_traits.h. cpplint
4fc7294 [Wes McKinney] Refactoring, type attribute consistency. Array reader compiles
2c93cce [Wes McKinney] WIP JSON array reader code path
932ba7a [Wes McKinney] Add ArrayVisitor methods, add enough metaprogramming to detect presence of c_type type member
15c1094 [Wes McKinney] Add type traits, refactoring, drafting json array writing. not working yet
209ba48 [Wes McKinney] More types refactoring. Strange linker error in pyarrow
379da3c [Wes McKinney] Implement union metadata JSON serialization
5fbea41 [Wes McKinney] Implement some more json types and add convenience factory functions
1c08233 [Wes McKinney] JSON schema roundtrip passing for many types
86c9559 [Wes McKinney] Add convenience factory functions for common types
3b9d14e [Wes McKinney] Add type-specific JSON metadata to schema writer
820b0f2 [Wes McKinney] Drafting JSON schema read/write
68ee7ab [Wes McKinney] Move forward declarations into type_fwd.h
1edf2a9 [Wes McKinney] Prototyping out visitor pattern for json serialization
24c1d5d [Wes McKinney] Some Types refactoring, add TypeVisitor abstract class. Add RapidJSON as external project
1 parent 8417096 commit ed6ec3b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+3049
-450
lines changed

cpp/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,25 @@ if(ARROW_BUILD_BENCHMARKS)
545545
endif()
546546
endif()
547547

548+
# RapidJSON, header only dependency
549+
if("$ENV{RAPIDJSON_HOME}" STREQUAL "")
550+
ExternalProject_Add(rapidjson_ep
551+
PREFIX "${CMAKE_BINARY_DIR}"
552+
URL "https://github.com/miloyip/rapidjson/archive/v1.1.0.tar.gz"
553+
URL_MD5 "badd12c511e081fec6c89c43a7027bce"
554+
CONFIGURE_COMMAND ""
555+
BUILD_COMMAND ""
556+
BUILD_IN_SOURCE 1
557+
INSTALL_COMMAND "")
558+
559+
ExternalProject_Get_Property(rapidjson_ep SOURCE_DIR)
560+
set(RAPIDJSON_INCLUDE_DIR "${SOURCE_DIR}/include")
561+
else()
562+
set(RAPIDJSON_INCLUDE_DIR "$ENV{RAPIDJSON_HOME}/include")
563+
endif()
564+
message(STATUS "RapidJSON include dir: ${RAPIDJSON_INCLUDE_DIR}")
565+
include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR})
566+
548567
## Google PerfTools
549568
##
550569
## Disabled with TSAN/ASAN as well as with gold+dynamic linking (see comment

cpp/src/arrow/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ install(FILES
2424
schema.h
2525
table.h
2626
type.h
27+
type_fwd.h
28+
type_traits.h
2729
test-util.h
2830
DESTINATION include/arrow)
2931

cpp/src/arrow/array.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,24 @@
1818
#include "arrow/array.h"
1919

2020
#include <cstdint>
21+
#include <cstring>
2122

2223
#include "arrow/util/bit-util.h"
2324
#include "arrow/util/buffer.h"
2425
#include "arrow/util/status.h"
2526

2627
namespace arrow {
2728

29+
// Allocate a zero-filled bitmap buffer sized for `length` bits.
//
// A PoolBuffer constructed from `pool` is resized to
// BitUtil::BytesForBits(length) bytes, all of its bytes are set to 0 with
// memset, and the buffer is then stored in `*result`.
//
// Returns Status::OK() once `*result` has been assigned. The Resize call is
// wrapped in RETURN_NOT_OK, which presumably returns a failed Status to the
// caller before `*result` is written -- confirm against the macro definition.
Status GetEmptyBitmap(
30+
MemoryPool* pool, int32_t length, std::shared_ptr<MutableBuffer>* result) {
31+
auto buffer = std::make_shared<PoolBuffer>(pool);
32+
RETURN_NOT_OK(buffer->Resize(BitUtil::BytesForBits(length)));
33+
memset(buffer->mutable_data(), 0, buffer->size());
34+
35+
*result = buffer;
36+
return Status::OK();
37+
}
38+
2839
// ----------------------------------------------------------------------
2940
// Base array class
3041

@@ -66,4 +77,8 @@ bool NullArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_st
6677
return true;
6778
}
6879

80+
// Visitor entry point for NullArray: passes this array to
// visitor->Visit(*this) and returns whatever Status that call produces.
Status NullArray::Accept(ArrayVisitor* visitor) const {
81+
return visitor->Visit(*this);
82+
}
83+
6984
} // namespace arrow

cpp/src/arrow/array.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@
2929
namespace arrow {
3030

3131
class Buffer;
32+
class MemoryPool;
33+
class MutableBuffer;
3234
class Status;
3335

3436
// Immutable data array with some logical type and some length. Any memory is
@@ -70,6 +72,8 @@ class ARROW_EXPORT Array {
7072
// returning Status::OK. This can be an expensive check.
7173
virtual Status Validate() const;
7274

75+
virtual Status Accept(ArrayVisitor* visitor) const = 0;
76+
7377
protected:
7478
std::shared_ptr<DataType> type_;
7579
int32_t null_count_;
@@ -86,6 +90,8 @@ class ARROW_EXPORT Array {
8690
// Degenerate null type Array
8791
class ARROW_EXPORT NullArray : public Array {
8892
public:
93+
using TypeClass = NullType;
94+
8995
NullArray(const std::shared_ptr<DataType>& type, int32_t length)
9096
: Array(type, length, length, nullptr) {}
9197

@@ -94,9 +100,15 @@ class ARROW_EXPORT NullArray : public Array {
94100
bool Equals(const std::shared_ptr<Array>& arr) const override;
95101
bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index,
96102
const std::shared_ptr<Array>& arr) const override;
103+
104+
Status Accept(ArrayVisitor* visitor) const override;
97105
};
98106

99107
typedef std::shared_ptr<Array> ArrayPtr;
108+
109+
Status ARROW_EXPORT GetEmptyBitmap(
110+
MemoryPool* pool, int32_t length, std::shared_ptr<MutableBuffer>* result);
111+
100112
} // namespace arrow
101113

102114
#endif

cpp/src/arrow/column-test.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
#include "gtest/gtest.h"
2424

25+
#include "arrow/array.h"
2526
#include "arrow/column.h"
2627
#include "arrow/schema.h"
2728
#include "arrow/test-util.h"

cpp/src/arrow/io/hdfs.cc

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -289,13 +289,9 @@ class HdfsClient::HdfsClientImpl {
289289

290290
// connect to HDFS with the builder object
291291
hdfsBuilder* builder = hdfsNewBuilder();
292-
if (!config->host.empty()) {
293-
hdfsBuilderSetNameNode(builder, config->host.c_str());
294-
}
292+
if (!config->host.empty()) { hdfsBuilderSetNameNode(builder, config->host.c_str()); }
295293
hdfsBuilderSetNameNodePort(builder, config->port);
296-
if (!config->user.empty()) {
297-
hdfsBuilderSetUserName(builder, config->user.c_str());
298-
}
294+
if (!config->user.empty()) { hdfsBuilderSetUserName(builder, config->user.c_str()); }
299295
if (!config->kerb_ticket.empty()) {
300296
hdfsBuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str());
301297
}

cpp/src/arrow/io/libhdfs_shim.cc

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,9 @@ static HINSTANCE libjvm_handle = NULL;
7474
// NOTE(wesm): cpplint does not like use of short and other imprecise C types
7575

7676
static hdfsBuilder* (*ptr_hdfsNewBuilder)(void) = NULL;
77-
static void (*ptr_hdfsBuilderSetNameNode)(
78-
hdfsBuilder* bld, const char* nn) = NULL;
79-
static void (*ptr_hdfsBuilderSetNameNodePort)(
80-
hdfsBuilder* bld, tPort port) = NULL;
81-
static void (*ptr_hdfsBuilderSetUserName)(
82-
hdfsBuilder* bld, const char* userName) = NULL;
77+
static void (*ptr_hdfsBuilderSetNameNode)(hdfsBuilder* bld, const char* nn) = NULL;
78+
static void (*ptr_hdfsBuilderSetNameNodePort)(hdfsBuilder* bld, tPort port) = NULL;
79+
static void (*ptr_hdfsBuilderSetUserName)(hdfsBuilder* bld, const char* userName) = NULL;
8380
static void (*ptr_hdfsBuilderSetKerbTicketCachePath)(
8481
hdfsBuilder* bld, const char* kerbTicketCachePath) = NULL;
8582
static hdfsFS (*ptr_hdfsBuilderConnect)(hdfsBuilder* bld) = NULL;
@@ -173,9 +170,9 @@ void hdfsBuilderSetUserName(hdfsBuilder* bld, const char* userName) {
173170
ptr_hdfsBuilderSetUserName(bld, userName);
174171
}
175172

176-
void hdfsBuilderSetKerbTicketCachePath(hdfsBuilder* bld,
177-
const char* kerbTicketCachePath) {
178-
ptr_hdfsBuilderSetKerbTicketCachePath(bld , kerbTicketCachePath);
173+
void hdfsBuilderSetKerbTicketCachePath(
174+
hdfsBuilder* bld, const char* kerbTicketCachePath) {
175+
ptr_hdfsBuilderSetKerbTicketCachePath(bld, kerbTicketCachePath);
179176
}
180177

181178
hdfsFS hdfsBuilderConnect(hdfsBuilder* bld) {
@@ -364,7 +361,7 @@ static std::vector<fs::path> get_potential_libhdfs_paths() {
364361
std::vector<fs::path> libhdfs_potential_paths;
365362
std::string file_name;
366363

367-
// OS-specific file name
364+
// OS-specific file name
368365
#ifdef __WIN32
369366
file_name = "hdfs.dll";
370367
#elif __APPLE__
@@ -374,10 +371,7 @@ static std::vector<fs::path> get_potential_libhdfs_paths() {
374371
#endif
375372

376373
// Common paths
377-
std::vector<fs::path> search_paths = {
378-
fs::path(""),
379-
fs::path(".")
380-
};
374+
std::vector<fs::path> search_paths = {fs::path(""), fs::path(".")};
381375

382376
// Path from environment variable
383377
const char* hadoop_home = std::getenv("HADOOP_HOME");
@@ -387,9 +381,7 @@ static std::vector<fs::path> get_potential_libhdfs_paths() {
387381
}
388382

389383
const char* libhdfs_dir = std::getenv("ARROW_LIBHDFS_DIR");
390-
if (libhdfs_dir != nullptr) {
391-
search_paths.push_back(fs::path(libhdfs_dir));
392-
}
384+
if (libhdfs_dir != nullptr) { search_paths.push_back(fs::path(libhdfs_dir)); }
393385

394386
// All paths with file name
395387
for (auto& path : search_paths) {

cpp/src/arrow/ipc/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ set(ARROW_IPC_TEST_LINK_LIBS
3434
set(ARROW_IPC_SRCS
3535
adapter.cc
3636
file.cc
37+
json.cc
38+
json-internal.cc
3739
metadata.cc
3840
metadata-internal.cc
3941
)
@@ -79,6 +81,10 @@ ADD_ARROW_TEST(ipc-metadata-test)
7981
ARROW_TEST_LINK_LIBRARIES(ipc-metadata-test
8082
${ARROW_IPC_TEST_LINK_LIBS})
8183

84+
ADD_ARROW_TEST(ipc-json-test)
85+
ARROW_TEST_LINK_LIBRARIES(ipc-json-test
86+
${ARROW_IPC_TEST_LINK_LIBS})
87+
8288
# make clean will delete the generated file
8389
set_source_files_properties(Metadata_generated.h PROPERTIES GENERATED TRUE)
8490

@@ -114,6 +120,7 @@ add_dependencies(arrow_objlib metadata_fbs)
114120
install(FILES
115121
adapter.h
116122
file.h
123+
json.h
117124
metadata.h
118125
DESTINATION include/arrow/ipc)
119126

cpp/src/arrow/ipc/adapter.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes
106106
buffers->push_back(binary_arr->data());
107107
} else if (arr->type_enum() == Type::LIST) {
108108
const auto list_arr = static_cast<const ListArray*>(arr);
109-
buffers->push_back(list_arr->offset_buffer());
109+
buffers->push_back(list_arr->offsets());
110110
RETURN_NOT_OK(VisitArray(
111111
list_arr->values().get(), field_nodes, buffers, max_recursion_depth - 1));
112112
} else if (arr->type_enum() == Type::STRUCT) {

0 commit comments

Comments
 (0)