Skip to content

Commit

Permalink
Merge branch 'branch-24.06' into fea-developer-guide-resource-ref
Browse files Browse the repository at this point in the history
  • Loading branch information
harrism authored Apr 29, 2024
2 parents 2f959b7 + 064dd7b commit ad4b562
Show file tree
Hide file tree
Showing 46 changed files with 1,760 additions and 512 deletions.
4 changes: 2 additions & 2 deletions cpp/benchmarks/string/contains.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
}

// longer pattern lengths demand more working memory per string
std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"};
std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"};

static void bench_contains(nvbench::state& state)
{
Expand Down Expand Up @@ -114,4 +114,4 @@ NVBENCH_BENCH(bench_contains)
.add_int64_axis("row_width", {32, 64, 128, 256, 512})
.add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
.add_int64_axis("hit_rate", {50, 100}) // percentage
.add_int64_axis("pattern", {0, 1});
.add_int64_axis("pattern", {0, 1, 2});
12 changes: 8 additions & 4 deletions cpp/benchmarks/string/count.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,13 @@

#include <nvbench/nvbench.cuh>

static std::string patterns[] = {"\\d+", "a"};

static void bench_count(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
auto const pattern_index = static_cast<cudf::size_type>(state.get_int64("pattern"));

if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
Expand All @@ -41,7 +44,7 @@ static void bench_count(nvbench::state& state)
create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
cudf::strings_column_view input(table->view().column(0));

std::string pattern = "\\d+";
auto const pattern = patterns[pattern_index];

auto prog = cudf::strings::regex_program::create(pattern);

Expand All @@ -59,4 +62,5 @@ static void bench_count(nvbench::state& state)
NVBENCH_BENCH(bench_count)
.set_name("count")
.add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
.add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
.add_int64_axis("pattern", {0, 1});
1 change: 1 addition & 0 deletions cpp/include/cudf/io/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ enum class column_encoding {
///< valid for BYTE_ARRAY columns)
DELTA_BYTE_ARRAY, ///< Use DELTA_BYTE_ARRAY encoding (only valid for
///< BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns)
BYTE_STREAM_SPLIT, ///< Use BYTE_STREAM_SPLIT encoding (valid for all fixed width types)
// ORC encodings:
DIRECT, ///< Use DIRECT encoding
DIRECT_V2, ///< Use DIRECT_V2 encoding
Expand Down
37 changes: 27 additions & 10 deletions cpp/include/cudf_test/testing_main.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,25 @@ inline auto parse_cudf_test_opts(int argc, char** argv)
}
}

/**
* @brief Creates the memory resource selected by the `rmm_mode` command line
* option and installs it as the current device resource
*
* The resource is always set as the current device resource; this does not
* depend on the stream mode option.
*
* The caller must keep the return object alive for the life of the test runs.
*
* @param cmd_opts Command line options returned by parse_cudf_test_opts
* @return Memory resource created for the requested `rmm_mode`
*/
inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
{
auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();
auto resource = cudf::test::create_memory_resource(rmm_mode);
// Only the raw pointer from `resource.get()` is registered here, so the returned
// `resource` object has to outlive every allocation made through it.
rmm::mr::set_current_device_resource(resource.get());
return resource;
}

/**
* @brief Sets up stream mode memory resource adaptor
*
Expand Down Expand Up @@ -181,14 +200,12 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
* function parses the command line to customize test behavior, like the
* allocation mode used for creating the default memory resource.
*/
#define CUDF_TEST_PROGRAM_MAIN() \
int main(int argc, char** argv) \
{ \
::testing::InitGoogleTest(&argc, argv); \
auto const cmd_opts = parse_cudf_test_opts(argc, argv); \
auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>(); \
auto resource = cudf::test::create_memory_resource(rmm_mode); \
rmm::mr::set_current_device_resource(resource.get()); \
auto adaptor = make_stream_mode_adaptor(cmd_opts); \
return RUN_ALL_TESTS(); \
#define CUDF_TEST_PROGRAM_MAIN() \
int main(int argc, char** argv) \
{ \
::testing::InitGoogleTest(&argc, argv); \
auto const cmd_opts = parse_cudf_test_opts(argc, argv); \
[[maybe_unused]] auto mr = make_memory_resource_adaptor(cmd_opts); \
[[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts); \
return RUN_ALL_TESTS(); \
}
18 changes: 17 additions & 1 deletion cpp/src/io/parquet/compact_protocol_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "compact_protocol_reader.hpp"

#include "parquet.hpp"
#include "parquet_common.hpp"

#include <cudf/utilities/error.hpp>

Expand Down Expand Up @@ -652,6 +653,9 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c)
{
using optional_size_statistics =
parquet_field_optional<SizeStatistics, parquet_field_struct<SizeStatistics>>;
using optional_list_enc_stats =
parquet_field_optional<std::vector<PageEncodingStats>,
parquet_field_struct_list<PageEncodingStats>>;
auto op = std::make_tuple(parquet_field_enum<Type>(1, c->type),
parquet_field_enum_list(2, c->encodings),
parquet_field_string_list(3, c->path_in_schema),
Expand All @@ -663,6 +667,7 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c)
parquet_field_int64(10, c->index_page_offset),
parquet_field_int64(11, c->dictionary_page_offset),
parquet_field_struct(12, c->statistics),
optional_list_enc_stats(13, c->encoding_stats),
optional_size_statistics(16, c->size_statistics));
function_builder(this, op);
}
Expand Down Expand Up @@ -758,13 +763,16 @@ void CompactProtocolReader::read(Statistics* s)
{
using optional_binary = parquet_field_optional<std::vector<uint8_t>, parquet_field_binary>;
using optional_int64 = parquet_field_optional<int64_t, parquet_field_int64>;
using optional_bool = parquet_field_optional<bool, parquet_field_bool>;

auto op = std::make_tuple(optional_binary(1, s->max),
optional_binary(2, s->min),
optional_int64(3, s->null_count),
optional_int64(4, s->distinct_count),
optional_binary(5, s->max_value),
optional_binary(6, s->min_value));
optional_binary(6, s->min_value),
optional_bool(7, s->is_max_value_exact),
optional_bool(8, s->is_min_value_exact));
function_builder(this, op);
}

Expand All @@ -774,6 +782,14 @@ void CompactProtocolReader::read(ColumnOrder* c)
function_builder(this, op);
}

// Reads a PageEncodingStats struct into `s`.
// Field ids handled: 1 = page_type (PageType enum), 2 = encoding (Encoding enum),
// 3 = count (int32).
void CompactProtocolReader::read(PageEncodingStats* s)
{
auto op = std::make_tuple(parquet_field_enum<PageType>(1, s->page_type),
parquet_field_enum<Encoding>(2, s->encoding),
parquet_field_int32(3, s->count));
// `op` is a named local, matching the other read() overloads in this file.
// NOTE(review): presumably function_builder dispatches each incoming field id to the
// matching entry in `op` - confirm against its definition.
function_builder(this, op);
}

void CompactProtocolReader::read(SortingColumn* s)
{
auto op = std::make_tuple(parquet_field_int32(1, s->column_idx),
Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/parquet/compact_protocol_reader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ class CompactProtocolReader {
void read(ColumnIndex* c);
void read(Statistics* s);
void read(ColumnOrder* c);
void read(PageEncodingStats* s);
void read(SortingColumn* s);

public:
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/io/parquet/compact_protocol_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s)
if (s.index_page_offset != 0) { c.field_int(10, s.index_page_offset); }
if (s.dictionary_page_offset != 0) { c.field_int(11, s.dictionary_page_offset); }
c.field_struct(12, s.statistics);
if (s.encoding_stats.has_value()) { c.field_struct_list(13, s.encoding_stats.value()); }
if (s.size_statistics.has_value()) { c.field_struct(16, s.size_statistics.value()); }
return c.value();
}
Expand All @@ -201,6 +202,8 @@ size_t CompactProtocolWriter::write(Statistics const& s)
if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); }
if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); }
if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); }
if (s.is_max_value_exact.has_value()) { c.field_bool(7, s.is_max_value_exact.value()); }
if (s.is_min_value_exact.has_value()) { c.field_bool(8, s.is_min_value_exact.value()); }
return c.value();
}

Expand Down Expand Up @@ -248,6 +251,15 @@ size_t CompactProtocolWriter::write(ColumnOrder const& co)
return c.value();
}

// Writes a PageEncodingStats struct.
// Field ids written: 1 = page_type, 2 = encoding, 3 = count; these are the same ids
// that CompactProtocolReader::read(PageEncodingStats*) consumes.
// NOTE(review): the return value is whatever CompactProtocolFieldWriter::value() yields,
// presumably the number of bytes written - confirm.
size_t CompactProtocolWriter::write(PageEncodingStats const& enc)
{
CompactProtocolFieldWriter c(*this);
// The enum members are cast to int32_t so they can be written with field_int.
c.field_int(1, static_cast<int32_t>(enc.page_type));
c.field_int(2, static_cast<int32_t>(enc.encoding));
c.field_int(3, enc.count);
return c.value();
}

size_t CompactProtocolWriter::write(SortingColumn const& sc)
{
CompactProtocolFieldWriter c(*this);
Expand Down
1 change: 1 addition & 0 deletions cpp/src/io/parquet/compact_protocol_writer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class CompactProtocolWriter {
size_t write(OffsetIndex const&);
size_t write(SizeStatistics const&);
size_t write(ColumnOrder const&);
size_t write(PageEncodingStats const&);
size_t write(SortingColumn const&);

protected:
Expand Down
Loading

0 comments on commit ad4b562

Please sign in to comment.