@@ -54,7 +54,7 @@ static const auto LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY =
5454 " \x04\x07\x08\x04 " sv;
5555
5656template <typename Generator>
57- auto build_vector (size_t items, Generator && gen) {
57+ auto GenerateVector (size_t items, Generator && gen) {
5858 std::vector<std::result_of_t <Generator (size_t )>> result;
5959 result.reserve (items);
6060 for (size_t i = 0 ; i < items; ++i) {
@@ -64,7 +64,7 @@ auto build_vector(size_t items, Generator && gen) {
6464 return result;
6565}
6666
67- std::string foobar (size_t i) {
67+ std::string FooBarSeq (size_t i) {
6868 std::string result;
6969 if (i % 3 == 0 )
7070 result += " Foo" ;
@@ -76,6 +76,34 @@ std::string foobar(size_t i) {
7676 return result;
7777}
7878
79+ template <typename T, typename U = T>
80+ auto SameValueSeq (const U & value) {
81+ return [&value](size_t ) -> T {
82+ return value;
83+ };
84+ }
85+
86+ template <typename ResultType, typename Generator1, typename Generator2>
87+ auto AlternateGenerators (Generator1 && gen1, Generator2 && gen2) {
88+ return [&gen1, &gen2](size_t i) -> ResultType {
89+ if (i % 2 == 0 )
90+ return gen1 (i/2 );
91+ else
92+ return gen2 (i/2 );
93+ };
94+ }
95+
96+ template <typename T>
97+ std::vector<T> ConcatSequences (std::vector<T> && vec1, std::vector<T> && vec2)
98+ {
99+ std::vector<T> result (vec1);
100+
101+ result.reserve (vec1.size () + vec2.size ());
102+ result.insert (result.end (), vec2.begin (), vec2.end ());
103+
104+ return result;
105+ }
106+
79107static std::vector<Int64> MakeDateTime64s () {
80108 static const auto seconds_multiplier = 1'000'000 ;
81109 static const auto year = 86400ull * 365 * seconds_multiplier; // ~approx, but this doesn't matter here.
@@ -85,7 +113,7 @@ static std::vector<Int64> MakeDateTime64s() {
85113 // Please note there are values outside of DateTime (32-bit) range that might
86114 // not have correct string representation in CH yet,
87115 // but still are supported as Int64 values.
88- return build_vector (200 ,
116+ return GenerateVector (200 ,
89117 [] (size_t i )-> Int64 {
90118 return (i - 100 ) * year * 2 + (i * 10 ) * seconds_multiplier + i;
91119 });
@@ -375,26 +403,26 @@ TEST(ColumnsCase, UUIDSlice) {
375403 ASSERT_EQ (sub->At (1 ), UInt128 (0x3507213c178649f9llu, 0x9faf035d662f60aellu));
376404}
377405
378- TEST (ColumnsCase, LowCardinalityWrapperString_Append_and_Read ) {
406+ TEST (ColumnsCase, ColumnLowCardinalityString_Append_and_Read ) {
379407 const size_t items_count = 11 ;
380408 ColumnLowCardinalityT<ColumnString> col;
381- for (const auto & item : build_vector (items_count, &foobar )) {
409+ for (const auto & item : GenerateVector (items_count, &FooBarSeq )) {
382410 col.Append (item);
383411 }
384412
385413 ASSERT_EQ (col.Size (), items_count);
386414 ASSERT_EQ (col.GetDictionarySize (), 8u + 1 ); // 8 unique items from sequence + 1 null-item
387415
388416 for (size_t i = 0 ; i < items_count; ++i) {
389- ASSERT_EQ (col.At (i), foobar (i)) << " at pos: " << i;
390- ASSERT_EQ (col[i], foobar (i)) << " at pos: " << i;
417+ ASSERT_EQ (col.At (i), FooBarSeq (i)) << " at pos: " << i;
418+ ASSERT_EQ (col[i], FooBarSeq (i)) << " at pos: " << i;
391419 }
392420}
393421
394- TEST (ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append ) {
422+ TEST (ColumnsCase, ColumnLowCardinalityString_Clear_and_Append ) {
395423 const size_t items_count = 11 ;
396424 ColumnLowCardinalityT<ColumnString> col;
397- for (const auto & item : build_vector (items_count, &foobar ))
425+ for (const auto & item : GenerateVector (items_count, &FooBarSeq ))
398426 {
399427 col.Append (item);
400428 }
@@ -403,7 +431,7 @@ TEST(ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append) {
403431 ASSERT_EQ (col.Size (), 0u );
404432 ASSERT_EQ (col.GetDictionarySize (), 1u ); // null-item
405433
406- for (const auto & item : build_vector (items_count, &foobar ))
434+ for (const auto & item : GenerateVector (items_count, &FooBarSeq ))
407435 {
408436 col.Append (item);
409437 }
@@ -412,7 +440,7 @@ TEST(ColumnsCase, ColumnLowCardinalityT_String_Clear_and_Append) {
412440 ASSERT_EQ (col.GetDictionarySize (), 8u + 1 ); // 8 unique items from sequence + 1 null-item
413441}
414442
415- TEST (ColumnsCase, LowCardinalityString_Load ) {
443+ TEST (ColumnsCase, ColumnLowCardinalityString_Load ) {
416444 const size_t items_count = 10 ;
417445 ColumnLowCardinalityT<ColumnString> col;
418446
@@ -423,25 +451,116 @@ TEST(ColumnsCase, LowCardinalityString_Load) {
423451 EXPECT_TRUE (col.Load (&stream, items_count));
424452
425453 for (size_t i = 0 ; i < items_count; ++i) {
426- EXPECT_EQ (col.At (i), foobar (i)) << " at pos: " << i;
454+ EXPECT_EQ (col.At (i), FooBarSeq (i)) << " at pos: " << i;
427455 }
428456}
429457
430- TEST (ColumnsCase, LowCardinalityString_Save) {
458+ // This is temporary diabled since we are not 100% compatitable with ClickHouse
459+ // on how we serailize LC columns, but we check interoperability in other tests (see client_ut.cpp)
460+ TEST (ColumnsCase, DISABLED_ColumnLowCardinalityString_Save) {
431461 const size_t items_count = 10 ;
432462 ColumnLowCardinalityT<ColumnString> col;
433- for (const auto & item : build_vector (items_count, &foobar )) {
463+ for (const auto & item : GenerateVector (items_count, &FooBarSeq )) {
434464 col.Append (item);
435465 }
436466
437- const auto & data = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY;
438- ArrayInput buffer (data.data (), data.size ());
439- CodedInputStream stream (&buffer);
467+ ArrayOutput output (0 , 0 );
468+ CodedOutputStream output_stream (&output);
440469
441- EXPECT_TRUE (col.Load (&stream, items_count));
470+ const size_t expected_output_size = LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY.size ();
471+ // Enough space to account for possible overflow from both right and left sides.
472+ char buffer[expected_output_size * 10 ] = {' \0 ' };
473+ const char margin_content[sizeof (buffer)] = {' \0 ' };
442474
443- for (size_t i = 0 ; i < items_count; ++i) {
444- EXPECT_EQ (col.At (i), foobar (i)) << " at pos: " << i;
475+ const size_t left_margin_size = 10 ;
476+ const size_t right_margin_size = sizeof (buffer) - left_margin_size - expected_output_size;
477+
478+ // Since overflow from left side is less likely to happen, leave only tiny margin there.
479+ auto write_pos = buffer + left_margin_size;
480+ const auto left_margin = buffer;
481+ const auto right_margin = write_pos + expected_output_size;
482+
483+ output.Reset (write_pos, expected_output_size);
484+
485+ EXPECT_NO_THROW (col.Save (&output_stream));
486+
487+ // Left margin should be blank
488+ EXPECT_EQ (std::string_view (margin_content, left_margin_size), std::string_view (left_margin, left_margin_size));
489+ // Right margin should be blank too
490+ EXPECT_EQ (std::string_view (margin_content, right_margin_size), std::string_view (right_margin, right_margin_size));
491+
492+ // TODO: right now LC columns do not write indexes in the most compact way possible, so binary representation is a bit different
493+ // (there might be other inconsistances too)
494+ EXPECT_EQ (LOWCARDINALITY_STRING_FOOBAR_10_ITEMS_BINARY, std::string_view (write_pos, expected_output_size));
495+ }
496+
497+ TEST (ColumnsCase, ColumnLowCardinalityString_SaveAndLoad) {
498+ // Verify that we can load binary representation back
499+ ColumnLowCardinalityT<ColumnString> col;
500+
501+ const auto items = GenerateVector (10 , &FooBarSeq);
502+ for (const auto & item : items) {
503+ col.Append (item);
504+ }
505+
506+ char buffer[256 ] = {' \0 ' }; // about 3 times more space than needed for this set of values.
507+ {
508+ ArrayOutput output (buffer, sizeof (buffer));
509+ CodedOutputStream output_stream (&output);
510+ EXPECT_NO_THROW (col.Save (&output_stream));
511+ }
512+
513+ col.Clear ();
514+
515+ {
516+ // Load the data back
517+ ArrayInput input (buffer, sizeof (buffer));
518+ CodedInputStream input_stream (&input);
519+ EXPECT_TRUE (col.Load (&input_stream, items.size ()));
520+ }
521+
522+ for (size_t i = 0 ; i < items.size (); ++i) {
523+ EXPECT_EQ (col.At (i), items[i]) << " at pos: " << i;
524+ }
525+ }
526+
527+ TEST (ColumnsCase, ColumnLowCardinalityString_WithEmptyString_1) {
528+ // Verify that when empty string is added to a LC column it can be retrieved back as empty string.
529+ ColumnLowCardinalityT<ColumnString> col;
530+ const auto values = GenerateVector (10 , AlternateGenerators<std::string>(SameValueSeq<std::string>(" " ), FooBarSeq));
531+ for (const auto & item : values) {
532+ col.Append (item);
533+ }
534+
535+ for (size_t i = 0 ; i < values.size (); ++i) {
536+ EXPECT_EQ (values[i], col.At (i)) << " at pos: " << i;
537+ }
538+ }
539+
540+ TEST (ColumnsCase, ColumnLowCardinalityString_WithEmptyString_2) {
541+ // Verify that when empty string is added to a LC column it can be retrieved back as empty string.
542+ // (Ver2): Make sure that outcome doesn't depend if empty values are on odd positions
543+ ColumnLowCardinalityT<ColumnString> col;
544+ const auto values = GenerateVector (10 , AlternateGenerators<std::string>(FooBarSeq, SameValueSeq<std::string>(" " )));
545+ for (const auto & item : values) {
546+ col.Append (item);
547+ }
548+
549+ for (size_t i = 0 ; i < values.size (); ++i) {
550+ EXPECT_EQ (values[i], col.At (i)) << " at pos: " << i;
551+ }
552+ }
553+
554+ TEST (ColumnsCase, ColumnLowCardinalityString_WithEmptyString_3) {
555+ // When we have many leading empty strings and some non-empty values.
556+ ColumnLowCardinalityT<ColumnString> col;
557+ const auto values = ConcatSequences (GenerateVector (100 , SameValueSeq<std::string>(" " )), GenerateVector (5 , FooBarSeq));
558+ for (const auto & item : values) {
559+ col.Append (item);
560+ }
561+
562+ for (size_t i = 0 ; i < values.size (); ++i) {
563+ EXPECT_EQ (values[i], col.At (i)) << " at pos: " << i;
445564 }
446565}
447566
0 commit comments