Skip to content

Commit b4f016b

Browse files
authored
Merge pull request #182 from arthurpassos/low_cardinality_nullable_fix
Implement LC of Nullable
2 parents b21883c + 8d9ac3b commit b4f016b

File tree

8 files changed

+249
-30
lines changed

8 files changed

+249
-30
lines changed

clickhouse/client.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,12 @@ void Client::Impl::WriteBlock(const Block& block, OutputStream& output) {
653653
WireFormat::WriteString(output, bi.Name());
654654
WireFormat::WriteString(output, bi.Type()->GetName());
655655

656-
bi.Column()->Save(&output);
656+
// Empty columns are not serialized and occupy exactly 0 bytes.
657+
// ref https://github.com/ClickHouse/ClickHouse/blob/39b37a3240f74f4871c8c1679910e065af6bea19/src/Formats/NativeWriter.cpp#L163
658+
const bool containsData = block.GetRowCount() > 0;
659+
if (containsData) {
660+
bi.Column()->Save(&output);
661+
}
657662
}
658663
output.Flush();
659664
}

clickhouse/client.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ struct ClientOptions {
8686
// TCP options
8787
DECLARE_FIELD(tcp_nodelay, bool, TcpNoDelay, true);
8888

89+
// TODO: deprecate this setting.
8990
/** It helps to ease migration of the old codebases, which can't afford to switch
9091
* to using ColumnLowCardinalityT or ColumnLowCardinality directly,
9192
* but still want to benefit from smaller on-wire LowCardinality bandwidth footprint.

clickhouse/columns/factory.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,8 @@ static ColumnRef CreateColumnFromAst(const TypeAst& ast, CreateColumnByTypeSetti
163163
return std::make_shared<LowCardinalitySerializationAdaptor<ColumnString>>();
164164
case Type::FixedString:
165165
return std::make_shared<LowCardinalitySerializationAdaptor<ColumnFixedString>>(nested.elements.front().value);
166+
case Type::Nullable:
167+
throw UnimplementedError("LowCardinality(" + nested.name + ") is not supported with LowCardinalityAsWrappedColumn on");
166168
default:
167169
throw UnimplementedError("LowCardinality(" + nested.name + ") is not supported");
168170
}
@@ -174,6 +176,13 @@ static ColumnRef CreateColumnFromAst(const TypeAst& ast, CreateColumnByTypeSetti
174176
return std::make_shared<ColumnLowCardinalityT<ColumnString>>();
175177
case Type::FixedString:
176178
return std::make_shared<ColumnLowCardinalityT<ColumnFixedString>>(nested.elements.front().value);
179+
case Type::Nullable:
180+
return std::make_shared<ColumnLowCardinality>(
181+
std::make_shared<ColumnNullable>(
182+
CreateColumnFromAst(nested.elements.front(), settings),
183+
std::make_shared<ColumnUInt8>()
184+
)
185+
);
177186
default:
178187
throw UnimplementedError("LowCardinality(" + nested.name + ") is not supported");
179188
}

clickhouse/columns/lowcardinality.cpp

Lines changed: 105 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,47 @@ inline auto VisitIndexColumn(Vizitor && vizitor, ColumnType && col) {
9494
}
9595
}
9696

97+
// A special NULL-item, which is expected at pos(0) in dictionary,
98+
// note that we distinguish empty string from NULL-value.
99+
inline auto GetNullItemForDictionary(const ColumnRef dictionary) {
100+
if (auto n = dictionary->As<ColumnNullable>()) {
101+
return ItemView {};
102+
} else {
103+
return ItemView{dictionary->Type()->GetCode(), std::string_view{}};
104+
}
105+
}
106+
107+
// A special default item, which is expected at pos(0) in dictionary,
108+
// note that we distinguish empty string from NULL-value.
109+
inline ItemView GetDefaultItemForDictionary(const ColumnRef dictionary) {
110+
if (auto n = dictionary->As<ColumnNullable>()) {
111+
return GetDefaultItemForDictionary(n->Nested());
112+
} else {
113+
return ItemView{dictionary->Type()->GetCode(), std::string_view{}};
114+
}
115+
}
116+
117+
void AppendToDictionary(Column& dictionary, const ItemView & item);
118+
119+
inline void AppendNullableToDictionary(ColumnNullable& nullable, const ItemView & item) {
120+
auto nested = nullable.Nested();
121+
122+
const bool isNullValue = item.type == Type::Void;
123+
124+
if (isNullValue) {
125+
AppendToDictionary(*nested, GetNullItemForDictionary(nested));
126+
} else {
127+
const auto nestedType = nested->GetType().GetCode();
128+
if (nestedType != item.type) {
129+
throw ValidationError("Invalid value. Type expected: " + nested->GetType().GetName());
130+
}
131+
132+
AppendToDictionary(*nested, item);
133+
}
134+
135+
nullable.Append(isNullValue);
136+
}
137+
97138
inline void AppendToDictionary(Column& dictionary, const ItemView & item) {
98139
switch (dictionary.GetType().GetCode()) {
99140
case Type::FixedString:
@@ -102,21 +143,14 @@ inline void AppendToDictionary(Column& dictionary, const ItemView & item) {
102143
case Type::String:
103144
column_down_cast<ColumnString>(dictionary).Append(item.get<std::string_view>());
104145
return;
146+
case Type::Nullable:
147+
AppendNullableToDictionary(column_down_cast<ColumnNullable>(dictionary), item);
148+
return;
105149
default:
106150
throw ValidationError("Unexpected dictionary column type: " + dictionary.GetType().GetName());
107151
}
108152
}
109153

110-
// A special NULL-item, which is expected at pos(0) in dictionary,
111-
// note that we distinguish empty string from NULL-value.
112-
inline auto GetNullItemForDictionary(const ColumnRef dictionary) {
113-
if (auto n = dictionary->As<ColumnNullable>()) {
114-
return ItemView{};
115-
} else {
116-
return ItemView{dictionary->Type()->GetCode(), std::string_view{}};
117-
}
118-
}
119-
120154
}
121155

122156
namespace clickhouse {
@@ -125,7 +159,23 @@ ColumnLowCardinality::ColumnLowCardinality(ColumnRef dictionary_column)
125159
dictionary_column_(dictionary_column->CloneEmpty()), // safe way to get a column of the same type.
126160
index_column_(std::make_shared<ColumnUInt32>())
127161
{
128-
AppendNullItemToEmptyColumn();
162+
Setup(dictionary_column);
163+
}
164+
165+
ColumnLowCardinality::ColumnLowCardinality(std::shared_ptr<ColumnNullable> dictionary_column)
166+
: Column(Type::CreateLowCardinality(dictionary_column->Type())),
167+
dictionary_column_(dictionary_column->CloneEmpty()), // safe way to get a column of the same type.
168+
index_column_(std::make_shared<ColumnUInt32>())
169+
{
170+
AppendNullItem();
171+
Setup(dictionary_column);
172+
}
173+
174+
ColumnLowCardinality::~ColumnLowCardinality()
175+
{}
176+
177+
void ColumnLowCardinality::Setup(ColumnRef dictionary_column) {
178+
AppendDefaultItem();
129179

130180
if (dictionary_column->Size() != 0) {
131181
// Add values, updating index_column_ and unique_items_map_.
@@ -140,9 +190,6 @@ ColumnLowCardinality::ColumnLowCardinality(ColumnRef dictionary_column)
140190
}
141191
}
142192

143-
ColumnLowCardinality::~ColumnLowCardinality()
144-
{}
145-
146193
std::uint64_t ColumnLowCardinality::getDictionaryIndex(std::uint64_t item_index) const {
147194
return VisitIndexColumn([item_index](const auto & arg) -> std::uint64_t {
148195
return arg[item_index];
@@ -215,7 +262,12 @@ auto Load(ColumnRef new_dictionary_column, InputStream& input, size_t rows) {
215262
if (!WireFormat::ReadFixed(input, &number_of_keys))
216263
throw ProtocolError("Failed to read number of rows in dictionary column.");
217264

218-
if (!new_dictionary_column->LoadBody(&input, number_of_keys))
265+
auto dataColumn = new_dictionary_column;
266+
if (auto nullable = new_dictionary_column->As<ColumnNullable>()) {
267+
dataColumn = nullable->Nested();
268+
}
269+
270+
if (!dataColumn->LoadBody(&input, number_of_keys))
219271
throw ProtocolError("Failed to read values of dictionary column.");
220272

221273
uint64_t number_of_rows;
@@ -227,8 +279,15 @@ auto Load(ColumnRef new_dictionary_column, InputStream& input, size_t rows) {
227279

228280
new_index_column->LoadBody(&input, number_of_rows);
229281

282+
if (auto nullable = new_dictionary_column->As<ColumnNullable>()) {
283+
nullable->Append(true);
284+
for(std::size_t i = 1; i < new_index_column->Size(); i++) {
285+
nullable->Append(false);
286+
}
287+
}
288+
230289
ColumnLowCardinality::UniqueItems new_unique_items_map;
231-
for (size_t i = 0; i < new_dictionary_column->Size(); ++i) {
290+
for (size_t i = 0; i < dataColumn->Size(); ++i) {
232291
const auto key = ColumnLowCardinality::computeHashKey(new_dictionary_column->GetItem(i));
233292
new_unique_items_map.emplace(key, i);
234293
}
@@ -278,10 +337,16 @@ void ColumnLowCardinality::SaveBody(OutputStream* output) {
278337

279338
const uint64_t number_of_keys = dictionary_column_->Size();
280339
WireFormat::WriteFixed(*output, number_of_keys);
281-
dictionary_column_->SaveBody(output);
340+
341+
if (auto columnNullable = dictionary_column_->As<ColumnNullable>()) {
342+
columnNullable->Nested()->SaveBody(output);
343+
} else {
344+
dictionary_column_->SaveBody(output);
345+
}
282346

283347
const uint64_t number_of_rows = index_column_->Size();
284348
WireFormat::WriteFixed(*output, number_of_rows);
349+
285350
index_column_->SaveBody(output);
286351
}
287352

@@ -290,7 +355,10 @@ void ColumnLowCardinality::Clear() {
290355
dictionary_column_->Clear();
291356
unique_items_map_.clear();
292357

293-
AppendNullItemToEmptyColumn();
358+
if (auto columnNullable = dictionary_column_->As<ColumnNullable>()) {
359+
AppendNullItem();
360+
}
361+
AppendDefaultItem();
294362
}
295363

296364
size_t ColumnLowCardinality::Size() const {
@@ -328,7 +396,17 @@ void ColumnLowCardinality::Swap(Column& other) {
328396
}
329397

330398
ItemView ColumnLowCardinality::GetItem(size_t index) const {
331-
return dictionary_column_->GetItem(getDictionaryIndex(index));
399+
const auto dictionaryIndex = getDictionaryIndex(index);
400+
401+
if (auto nullable = dictionary_column_->As<ColumnNullable>()) {
402+
const auto isNull = dictionaryIndex == 0u;
403+
404+
if (isNull) {
405+
return GetNullItemForDictionary(nullable);
406+
}
407+
}
408+
409+
return dictionary_column_->GetItem(dictionaryIndex);
332410
}
333411

334412
// No checks regarding the value type or the validity of the value are made.
@@ -359,19 +437,20 @@ void ColumnLowCardinality::AppendUnsafe(const ItemView & value) {
359437
}
360438
}
361439

362-
void ColumnLowCardinality::AppendNullItemToEmptyColumn()
440+
void ColumnLowCardinality::AppendNullItem()
363441
{
364-
// INVARIANT: Empty LC column has an (invisible) null-item at pos 0, which MUST be present in
365-
// unique_items_map_ in order to reuse dictionary position on subsequent Append()-s.
366-
367-
// Should be only performed on empty LC column.
368-
assert(dictionary_column_->Size() == 0 && unique_items_map_.empty());
369-
370442
const auto null_item = GetNullItemForDictionary(dictionary_column_);
371443
AppendToDictionary(*dictionary_column_, null_item);
372444
unique_items_map_.emplace(computeHashKey(null_item), 0);
373445
}
374446

447+
void ColumnLowCardinality::AppendDefaultItem()
448+
{
449+
const auto defaultItem = GetDefaultItemForDictionary(dictionary_column_);
450+
unique_items_map_.emplace(computeHashKey(defaultItem), dictionary_column_->Size());
451+
AppendToDictionary(*dictionary_column_, defaultItem);
452+
}
453+
375454
size_t ColumnLowCardinality::GetDictionarySize() const {
376455
return dictionary_column_->Size();
377456
}

clickhouse/columns/lowcardinality.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include "column.h"
44
#include "numeric.h"
5+
#include "nullable.h"
56

67
#include <functional>
78
#include <string>
@@ -32,6 +33,11 @@ struct LowCardinalityHashKeyHash {
3233

3334
}
3435

36+
/*
37+
* LC column contains an "invisible" default item at the beginning of the collection. [default, ...]
38+
* If the nested type is Nullable, it contains a null-item at the beginning and a default item at the second position. [null, default, ...]
39+
* Null map is not serialized in LC columns. Instead, a row is NULL exactly when its index into the dictionary is 0.
40+
* */
3541
class ColumnLowCardinality : public Column {
3642
public:
3743
using UniqueItems = std::unordered_map<details::LowCardinalityHashKey, size_t /*dictionary index*/, details::LowCardinalityHashKeyHash>;
@@ -49,6 +55,7 @@ class ColumnLowCardinality : public Column {
4955
public:
5056
// c-tor makes a deep copy of the dictionary_column.
5157
explicit ColumnLowCardinality(ColumnRef dictionary_column);
58+
explicit ColumnLowCardinality(std::shared_ptr<ColumnNullable> dictionary_column);
5259
~ColumnLowCardinality();
5360

5461
/// Appends another LowCardinality column to the end of this one, updating dictionary.
@@ -84,12 +91,14 @@ class ColumnLowCardinality : public Column {
8491
std::uint64_t getDictionaryIndex(std::uint64_t item_index) const;
8592
void appendIndex(std::uint64_t item_index);
8693
void removeLastIndex();
87-
8894
ColumnRef GetDictionary();
95+
8996
void AppendUnsafe(const ItemView &);
9097

9198
private:
92-
void AppendNullItemToEmptyColumn();
99+
void Setup(ColumnRef dictionary_column);
100+
void AppendNullItem();
101+
void AppendDefaultItem();
93102

94103
public:
95104
static details::LowCardinalityHashKey computeHashKey(const ItemView &);

clickhouse/columns/nullable.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@ void ColumnNullable::SaveBody(OutputStream* output) {
7474
}
7575

7676
size_t ColumnNullable::Size() const {
77-
assert(nested_->Size() == nulls_->Size());
7877
return nulls_->Size();
7978
}
8079

ut/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ SET ( clickhouse-cpp-ut-src
2222

2323
utils.cpp
2424
value_generators.cpp
25+
low_cardinality_nullable_tests.cpp
2526
)
2627

2728
IF (WITH_OPENSSL)

0 commit comments

Comments
 (0)