Skip to content

Commit e7c573f

Browse files
Merge different schemas (#13192)
1 parent 96abd52 commit e7c573f

File tree

23 files changed

+356
-81
lines changed

23 files changed

+356
-81
lines changed

ydb/core/formats/arrow/process_columns.cpp

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,27 @@ TConclusion<std::shared_ptr<TDataContainer>> ReorderImpl(
138138

139139
} // namespace
140140

141+
TOrderedColumnIndexesImpl::TOrderedColumnIndexesImpl(const std::vector<ui32>& columnIndexes)
142+
: ColumnIndexes(columnIndexes) {
143+
for (ui32 i = 0; i + 1 < columnIndexes.size(); ++i) {
144+
AFL_VERIFY(ColumnIndexes[i] < ColumnIndexes[i + 1]);
145+
}
146+
}
147+
148+
TOrderedColumnIndexesImpl::TOrderedColumnIndexesImpl(std::vector<ui32>&& columnIndexes)
149+
: ColumnIndexes(std::move(columnIndexes)) {
150+
for (ui32 i = 0; i + 1 < ColumnIndexes.size(); ++i) {
151+
AFL_VERIFY(ColumnIndexes[i] < ColumnIndexes[i + 1]);
152+
}
153+
}
154+
155+
TOrderedColumnIndexesImpl::TOrderedColumnIndexesImpl(const ui32 columnsCount) {
156+
ColumnIndexes.reserve(columnsCount);
157+
for (ui32 i = 0; i < columnsCount; ++i) {
158+
ColumnIndexes.emplace_back(i);
159+
}
160+
}
161+
141162
std::shared_ptr<arrow::RecordBatch> TColumnOperator::Extract(
142163
const std::shared_ptr<arrow::RecordBatch>& incoming, const std::vector<std::string>& columnNames) {
143164
return ExtractImpl(AbsentColumnPolicy, incoming, columnNames);
@@ -259,7 +280,7 @@ TConclusion<TSchemaSubset> TColumnOperator::BuildSequentialSubset(
259280
}
260281
namespace {
261282
template <class TDataContainer>
262-
TConclusion<std::shared_ptr<TDataContainer>> AdaptIncomingToDestinationExtImpl(const std::shared_ptr<TDataContainer>& incoming,
283+
TConclusion<TContainerWithIndexes<TDataContainer>> AdaptIncomingToDestinationExtImpl(const std::shared_ptr<TDataContainer>& incoming,
263284
const TSchemaLiteView& dstSchema, const std::function<TConclusionStatus(const ui32, const i32)>& checker,
264285
const std::function<i32(const std::string&)>& nameResolver, const TColumnOperator::ECheckFieldTypesPolicy differentColumnTypesPolicy,
265286
const TColumnOperator::EAbsentFieldPolicy absentColumnPolicy) {
@@ -318,14 +339,17 @@ TConclusion<std::shared_ptr<TDataContainer>> AdaptIncomingToDestinationExtImpl(c
318339
std::vector<std::shared_ptr<typename NAdapter::TDataBuilderPolicy<TDataContainer>::TColumn>> columns;
319340
columns.reserve(resultColumns.size());
320341
fields.reserve(resultColumns.size());
342+
std::vector<ui32> indexes;
321343
for (auto&& i : resultColumns) {
322344
fields.emplace_back(dstSchema.field(i.Index));
323345
columns.emplace_back(i.Column);
346+
indexes.emplace_back(i.Index);
324347
}
325-
return NAdapter::TDataBuilderPolicy<TDataContainer>::Build(std::make_shared<arrow::Schema>(fields), std::move(columns), incoming->num_rows());
348+
return TContainerWithIndexes<TDataContainer>(indexes,
349+
NAdapter::TDataBuilderPolicy<TDataContainer>::Build(std::make_shared<arrow::Schema>(fields), std::move(columns), incoming->num_rows()));
326350
}
327351
} // namespace
328-
TConclusion<std::shared_ptr<arrow::RecordBatch>> TColumnOperator::AdaptIncomingToDestinationExt(
352+
TConclusion<TContainerWithIndexes<arrow::RecordBatch>> TColumnOperator::AdaptIncomingToDestinationExt(
329353
const std::shared_ptr<arrow::RecordBatch>& incoming, const TSchemaLiteView& dstSchema,
330354
const std::function<TConclusionStatus(const ui32, const i32)>& checker, const std::function<i32(const std::string&)>& nameResolver) const {
331355
return AdaptIncomingToDestinationExtImpl(incoming, dstSchema, checker, nameResolver, DifferentColumnTypesPolicy, AbsentColumnPolicy);

ydb/core/formats/arrow/process_columns.h

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
#pragma once
2+
#include <ydb/library/accessor/accessor.h>
3+
#include <ydb/library/accessor/validator_simple.h>
24
#include <ydb/library/conclusion/result.h>
35

46
#include <contrib/libs/apache/arrow/cpp/src/arrow/type.h>
7+
58
#include <functional>
69

710
namespace NKikimr::NArrow {
@@ -10,6 +13,113 @@ class TSchemaSubset;
1013
class TSchemaLite;
1114
class TSchemaLiteView;
1215

16+
class TOrderedColumnIndexesImpl {
17+
private:
18+
YDB_READONLY_DEF(std::vector<ui32>, ColumnIndexes);
19+
20+
public:
21+
TOrderedColumnIndexesImpl() = default;
22+
23+
explicit TOrderedColumnIndexesImpl(const ui32 columnsCount);
24+
explicit TOrderedColumnIndexesImpl(const std::vector<ui32>& columnIndexes);
25+
explicit TOrderedColumnIndexesImpl(std::vector<ui32>&& columnIndexes);
26+
27+
template <class TContainerWithIndexes>
28+
static std::vector<ui32> MergeColumnIdxs(const std::vector<TContainerWithIndexes>& sources) {
29+
class TIterator {
30+
private:
31+
std::vector<ui32>::const_iterator ItCurrent;
32+
std::vector<ui32>::const_iterator ItFinish;
33+
34+
public:
35+
TIterator(const std::vector<ui32>& indexes)
36+
: ItCurrent(indexes.begin())
37+
, ItFinish(indexes.end()) {
38+
}
39+
40+
bool operator<(const TIterator& item) const {
41+
return *ItCurrent > *item.ItCurrent;
42+
}
43+
44+
bool IsValid() const {
45+
return ItCurrent != ItFinish;
46+
}
47+
48+
ui32 operator*() const {
49+
return *ItCurrent;
50+
}
51+
52+
bool Next() {
53+
return ++ItCurrent != ItFinish;
54+
}
55+
};
56+
57+
std::vector<TIterator> heapToMerge;
58+
for (auto&& i : sources) {
59+
heapToMerge.emplace_back(TIterator(i.GetColumnIndexes()));
60+
if (!heapToMerge.back().IsValid()) {
61+
heapToMerge.pop_back();
62+
}
63+
}
64+
std::make_heap(heapToMerge.begin(), heapToMerge.end());
65+
std::vector<ui32> result;
66+
while (heapToMerge.size()) {
67+
std::pop_heap(heapToMerge.begin(), heapToMerge.end());
68+
if (result.empty() || result.back() != *heapToMerge.back()) {
69+
result.emplace_back(*heapToMerge.back());
70+
}
71+
if (!heapToMerge.back().Next()) {
72+
heapToMerge.pop_back();
73+
} else {
74+
std::push_heap(heapToMerge.begin(), heapToMerge.end());
75+
}
76+
}
77+
return result;
78+
}
79+
};
80+
81+
template <class TDataContainer>
82+
class TContainerWithIndexes: public TOrderedColumnIndexesImpl {
83+
private:
84+
using TBase = TOrderedColumnIndexesImpl;
85+
YDB_ACCESSOR_DEF(std::shared_ptr<TDataContainer>, Container);
86+
87+
public:
88+
TContainerWithIndexes() = default;
89+
90+
TContainerWithIndexes(const std::vector<ui32>& columnIndexes, const std::shared_ptr<TDataContainer>& container)
91+
: TBase(columnIndexes)
92+
, Container(container) {
93+
if (Container) {
94+
Y_ABORT_UNLESS((ui32)Container->num_columns() == columnIndexes.size());
95+
} else {
96+
Y_ABORT_UNLESS(!columnIndexes.size());
97+
}
98+
}
99+
100+
explicit TContainerWithIndexes(const std::shared_ptr<TDataContainer>& container)
101+
: TBase(TSimpleValidator::CheckNotNull(container)->num_columns())
102+
, Container(container) {
103+
}
104+
105+
TContainerWithIndexes<TDataContainer> BuildWithAnotherContainer(const std::shared_ptr<TDataContainer>& container) const {
106+
return TContainerWithIndexes<TDataContainer>(GetColumnIndexes(), container);
107+
}
108+
109+
bool operator!() const {
110+
return !HasContainer();
111+
}
112+
113+
bool HasContainer() const {
114+
return !!Container;
115+
}
116+
117+
const TDataContainer* operator->() const {
118+
return Container.get();
119+
}
120+
121+
};
122+
13123
class TColumnOperator {
14124
public:
15125
enum class EAbsentFieldPolicy {
@@ -59,7 +169,7 @@ class TColumnOperator {
59169
return *this;
60170
}
61171

62-
TConclusion<std::shared_ptr<arrow::RecordBatch>> AdaptIncomingToDestinationExt(const std::shared_ptr<arrow::RecordBatch>& incoming,
172+
TConclusion<TContainerWithIndexes<arrow::RecordBatch>> AdaptIncomingToDestinationExt(const std::shared_ptr<arrow::RecordBatch>& incoming,
63173
const TSchemaLiteView& dstSchema, const std::function<TConclusionStatus(const ui32, const i32)>& checker,
64174
const std::function<i32(const std::string&)>& nameResolver) const;
65175

ydb/core/kqp/ut/olap/helpers/typed_local.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ TString TTypedLocalHelper::GetTestTableSchema() const {
1515
if (TypeName) {
1616
result = R"(Columns { Name: "field" Type: ")" + TypeName + "\"}";
1717
}
18+
if (TypeName1) {
19+
result += R"(Columns { Name: "field1" Type: ")" + TypeName1 + "\"}";
20+
}
1821
result += R"(
1922
Columns { Name: "pk_int" Type: "Int64" NotNull: true }
2023
Columns { Name: "ts" Type: "Timestamp" }

ydb/core/kqp/ut/olap/helpers/typed_local.h

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ class TTypedLocalHelper: public Tests::NCS::THelper {
1717
private:
1818
using TBase = Tests::NCS::THelper;
1919
const TString TypeName;
20+
const TString TypeName1;
2021
TKikimrRunner& KikimrRunner;
2122
const TString TablePath;
2223
const TString TableName;
@@ -37,6 +38,18 @@ class TTypedLocalHelper: public Tests::NCS::THelper {
3738
SetShardingMethod("HASH_FUNCTION_CONSISTENCY_64");
3839
}
3940

41+
TTypedLocalHelper(const TString& typeName, const TString& typeName1, TKikimrRunner& kikimrRunner, const TString& tableName = "olapTable",
42+
const TString& storeName = "olapStore")
43+
: TBase(kikimrRunner.GetTestServer())
44+
, TypeName(typeName)
45+
, TypeName1(typeName1)
46+
, KikimrRunner(kikimrRunner)
47+
, TablePath(storeName.empty() ? "/Root/" + tableName : "/Root/" + storeName + "/" + tableName)
48+
, TableName(tableName)
49+
, StoreName(storeName) {
50+
SetShardingMethod("HASH_FUNCTION_CONSISTENCY_64");
51+
}
52+
4053
class TSimultaneousWritingSession {
4154
private:
4255
bool Finished = false;
@@ -54,13 +67,12 @@ class TTypedLocalHelper: public Tests::NCS::THelper {
5467
}
5568

5669
template <class TFiller>
57-
void FillTable(const TFiller& fillPolicy, const double pkKff = 0, const ui32 numRows = 800000) const {
58-
AFL_VERIFY(!Finished);
70+
void FillTable(const TString& fieldName, const TFiller& fillPolicy, const double pkKff = 0, const ui32 numRows = 800000) const {
5971
std::vector<NArrow::NConstruction::IArrayBuilder::TPtr> builders;
6072
builders.emplace_back(
6173
NArrow::NConstruction::TSimpleArrayConstructor<NArrow::NConstruction::TIntSeqFiller<arrow::Int64Type>>::BuildNotNullable(
6274
"pk_int", numRows * pkKff));
63-
builders.emplace_back(std::make_shared<NArrow::NConstruction::TSimpleArrayConstructor<TFiller>>("field", fillPolicy));
75+
builders.emplace_back(std::make_shared<NArrow::NConstruction::TSimpleArrayConstructor<TFiller>>(fieldName, fillPolicy));
6476
NArrow::NConstruction::TRecordBatchConstructor batchBuilder(builders);
6577
std::shared_ptr<arrow::RecordBatch> batch = batchBuilder.BuildBatch(numRows);
6678
SendDataViaActorSystem(TablePath, batch, Ydb::StatusIds::SUCCESS);

ydb/core/kqp/ut/olap/write_ut.cpp

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -202,11 +202,11 @@ Y_UNIT_TEST_SUITE(KqpOlapWrite) {
202202
TTypedLocalHelper helper("Utf8", kikimr);
203203
helper.CreateTestOlapTable();
204204
auto writeSession = helper.StartWriting("/Root/olapStore/olapTable");
205-
writeSession.FillTable(NArrow::NConstruction::TStringPoolFiller(1, 1, "aaa", 1), 0, 800000);
205+
writeSession.FillTable("field", NArrow::NConstruction::TStringPoolFiller(1, 1, "aaa", 1), 0, 800000);
206206
Sleep(TDuration::Seconds(1));
207-
writeSession.FillTable(NArrow::NConstruction::TStringPoolFiller(1, 1, "bbb", 1), 0.5, 800000);
207+
writeSession.FillTable("field", NArrow::NConstruction::TStringPoolFiller(1, 1, "bbb", 1), 0.5, 800000);
208208
Sleep(TDuration::Seconds(1));
209-
writeSession.FillTable(NArrow::NConstruction::TStringPoolFiller(1, 1, "ccc", 1), 0.75, 800000);
209+
writeSession.FillTable("field", NArrow::NConstruction::TStringPoolFiller(1, 1, "ccc", 1), 0.75, 800000);
210210
Sleep(TDuration::Seconds(1));
211211
writeSession.Finalize();
212212

@@ -228,6 +228,40 @@ Y_UNIT_TEST_SUITE(KqpOlapWrite) {
228228
UNIT_ASSERT_VALUES_EQUAL(GetUtf8(rows[2].at("field")), "ccc");
229229
}
230230

231+
Y_UNIT_TEST(MultiWriteInTimeDiffSchemas) {
232+
auto settings = TKikimrSettings().SetWithSampleTables(false);
233+
settings.AppConfig.MutableColumnShardConfig()->SetWritingBufferDurationMs(15000);
234+
TKikimrRunner kikimr(settings);
235+
Tests::NCommon::TLoggerInit(kikimr).Initialize();
236+
TTypedLocalHelper helper("Utf8", "Utf8", kikimr);
237+
helper.CreateTestOlapTable();
238+
auto writeGuard = helper.StartWriting("/Root/olapStore/olapTable");
239+
writeGuard.FillTable("field", NArrow::NConstruction::TStringPoolFiller(1, 1, "aaa", 1), 0, 800000);
240+
Sleep(TDuration::Seconds(1));
241+
writeGuard.FillTable("field1", NArrow::NConstruction::TStringPoolFiller(1, 1, "bbb", 1), 0.5, 800000);
242+
Sleep(TDuration::Seconds(1));
243+
writeGuard.FillTable("field", NArrow::NConstruction::TStringPoolFiller(1, 1, "ccc", 1), 0.75, 800000);
244+
Sleep(TDuration::Seconds(1));
245+
writeGuard.Finalize();
246+
247+
auto selectQuery = TString(R"(
248+
SELECT
249+
field, count(*) as count,
250+
FROM `/Root/olapStore/olapTable`
251+
GROUP BY field
252+
ORDER BY field
253+
)");
254+
255+
auto tableClient = kikimr.GetTableClient();
256+
auto rows = ExecuteScanQuery(tableClient, selectQuery);
257+
UNIT_ASSERT_VALUES_EQUAL(GetUint64(rows[0].at("count")), 200000);
258+
UNIT_ASSERT_VALUES_EQUAL(GetUtf8(rows[0].at("field")), "");
259+
UNIT_ASSERT_VALUES_EQUAL(GetUint64(rows[1].at("count")), 400000);
260+
UNIT_ASSERT_VALUES_EQUAL(GetUtf8(rows[1].at("field")), "aaa");
261+
UNIT_ASSERT_VALUES_EQUAL(GetUint64(rows[2].at("count")), 800000);
262+
UNIT_ASSERT_VALUES_EQUAL(GetUtf8(rows[2].at("field")), "ccc");
263+
}
264+
231265
Y_UNIT_TEST(WriteDeleteCleanGC) {
232266
auto csController = NKikimr::NYDBTest::TControllers::RegisterCSControllerGuard<NKikimr::NOlap::TWaitCompactionController>();
233267
csController->SetSmallSizeDetector(1000000);

ydb/core/testlib/cs_helper.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ void THelperSchemaless::SendDataViaActorSystem(TString testTable, std::shared_pt
9191
Cerr << "\n";
9292
}
9393
UNIT_ASSERT_VALUES_EQUAL(op.status(), expectedStatus);
94-
});
94+
});
9595

9696
TDispatchOptions options;
9797
options.CustomFinalCondition = [&]() {
@@ -228,7 +228,7 @@ void THelper::CreateSchemaOlapTablesWithStore(const TString tableSchema, TVector
228228
}
229229

230230
void THelper::CreateOlapTablesWithStore(TVector<TString> tableNames /*= {"olapTable"}*/, TString storeName /*= "olapStore"*/, ui32 storeShardsCount /*= 4*/, ui32 tableShardsCount /*= 3*/) {
231-
CreateSchemaOlapTablesWithStore(GetTestTableSchema(), tableNames, storeName, storeShardsCount, tableShardsCount);
231+
CreateSchemaOlapTablesWithStore(GetTestTableSchema(), tableNames, storeName, storeShardsCount, tableShardsCount);
232232
}
233233

234234
void THelper::CreateSchemaOlapTables(const TString tableSchema, TVector<TString> tableNames, ui32 tableShardsCount) {
@@ -254,15 +254,15 @@ void THelper::CreateSchemaOlapTables(const TString tableSchema, TVector<TString>
254254
}
255255

256256
void THelper::CreateOlapTables(TVector<TString> tableNames /*= {"olapTable"}*/, ui32 tableShardsCount /*= 3*/) {
257-
CreateSchemaOlapTables(GetTestTableSchema(), tableNames, tableShardsCount);
257+
CreateSchemaOlapTables(GetTestTableSchema(), tableNames, tableShardsCount);
258258
}
259259

260260
// Clickbench table
261261

262262
std::shared_ptr<arrow::Schema> TCickBenchHelper::GetArrowSchema() const {
263263
return std::make_shared<arrow::Schema>(
264264
std::vector<std::shared_ptr<arrow::Field>> {
265-
arrow::field("WatchID", arrow::int64(), false),
265+
arrow::field("WatchID", arrow::int64(), false),
266266
arrow::field("JavaEnable", arrow::int16(), false),
267267
arrow::field("Title", arrow::utf8(), false),
268268
arrow::field("GoodEvent", arrow::int16(), false),
@@ -430,7 +430,7 @@ std::shared_ptr<arrow::RecordBatch> TCickBenchHelper::TestArrowBatch(ui64, ui64
430430
std::shared_ptr<arrow::Schema> TTableWithNullsHelper::GetArrowSchema() const {
431431
return std::make_shared<arrow::Schema>(
432432
std::vector<std::shared_ptr<arrow::Field>>{
433-
arrow::field("id", arrow::int32(), false),
433+
arrow::field("id", arrow::int32(), false),
434434
arrow::field("resource_id", arrow::utf8()),
435435
arrow::field("level", arrow::int32()),
436436
arrow::field("binary_str", arrow::binary()),

ydb/core/tx/columnshard/engines/scheme/index_info.cpp

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,14 +165,28 @@ std::shared_ptr<arrow::Field> TIndexInfo::GetColumnFieldVerified(const ui32 colu
165165
}
166166

167167
std::shared_ptr<arrow::Schema> TIndexInfo::GetColumnsSchema(const std::set<ui32>& columnIds) const {
168-
Y_ABORT_UNLESS(columnIds.size());
168+
AFL_VERIFY(columnIds.size());
169169
std::vector<std::shared_ptr<arrow::Field>> fields;
170170
for (auto&& i : columnIds) {
171171
fields.emplace_back(GetColumnFieldVerified(i));
172172
}
173173
return std::make_shared<arrow::Schema>(fields);
174174
}
175175

176+
std::shared_ptr<arrow::Schema> TIndexInfo::GetColumnsSchemaByOrderedIndexes(const std::vector<ui32>& columnIdxs) const {
177+
AFL_VERIFY(columnIdxs.size());
178+
std::vector<std::shared_ptr<arrow::Field>> fields;
179+
std::optional<ui32> predColumnIdx;
180+
for (auto&& i : columnIdxs) {
181+
if (predColumnIdx) {
182+
AFL_VERIFY(*predColumnIdx < i);
183+
}
184+
predColumnIdx = i;
185+
fields.emplace_back(ArrowSchemaWithSpecials()->GetFieldByIndexVerified(i));
186+
}
187+
return std::make_shared<arrow::Schema>(fields);
188+
}
189+
176190
std::shared_ptr<arrow::Schema> TIndexInfo::GetColumnSchema(const ui32 columnId) const {
177191
return GetColumnsSchema({ columnId });
178192
}
@@ -621,4 +635,18 @@ TIndexInfo TIndexInfo::BuildDefault() {
621635
return result;
622636
}
623637

638+
TConclusion<std::shared_ptr<arrow::Array>> TIndexInfo::BuildDefaultColumn(
639+
const ui32 fieldIndex, const ui32 rowsCount, const bool force) const {
640+
auto defaultValue = GetColumnExternalDefaultValueByIndexVerified(fieldIndex);
641+
auto f = ArrowSchemaWithSpecials()->GetFieldByIndexVerified(fieldIndex);
642+
if (!defaultValue && !IsNullableVerifiedByIndex(fieldIndex)) {
643+
if (force) {
644+
defaultValue = NArrow::DefaultScalar(f->type());
645+
} else {
646+
return TConclusionStatus::Fail("not nullable field with no default: " + f->name());
647+
}
648+
}
649+
return NArrow::TThreadSimpleArraysCache::Get(f->type(), defaultValue, rowsCount);
650+
}
651+
624652
} // namespace NKikimr::NOlap

0 commit comments

Comments
 (0)