Skip to content

Commit 386cc57

Browse files
ivanmorozov333ivanmorozov333
andauthored
compaction improving for different pages for each column (#18163)
Co-authored-by: ivanmorozov333 <imorozov333@ya.ru>
1 parent 3be05eb commit 386cc57

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

80 files changed

+1935
-1143
lines changed

.github/config/muted_ya.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ ydb/core/kqp/ut/olap KqpOlapJson.CompactionVariants
2626
ydb/core/kqp/ut/olap KqpOlapJson.DuplicationCompactionVariants
2727
ydb/core/kqp/ut/olap KqpOlapJson.SwitchAccessorCompactionVariants
2828
ydb/core/kqp/ut/olap KqpOlapWrite.TierDraftsGCWithRestart
29-
ydb/core/kqp/ut/olap KqpOlapSysView.StatsSysViewEnumStringBytes
29+
3030
ydb/core/kqp/ut/olap [*/*] chunk chunk
3131
ydb/core/kqp/ut/query KqpAnalyze.AnalyzeTable+ColumnStore
3232
ydb/core/kqp/ut/query KqpAnalyze.AnalyzeTable-ColumnStore
@@ -130,6 +130,7 @@ ydb/tests/olap/ttl_tiering [data_migration_when_alter_ttl.py] chunk chunk
130130
ydb/tests/olap/ttl_tiering [ttl_delete_s3.py] chunk chunk
131131
ydb/tests/olap/ttl_tiering data_migration_when_alter_ttl.py.TestDataMigrationWhenAlterTtl.test
132132
ydb/tests/olap/ttl_tiering sole chunk chunk
133+
ydb/tests/olap/ttl_tiering ttl_unavailable_s3.py.TestUnavailableS3.test
133134
ydb/tests/olap/ttl_tiering ttl_delete_s3.py.TestDeleteS3Ttl.test_delete_s3_tiering
134135
ydb/tests/olap/ttl_tiering ttl_delete_s3.py.TestDeleteTtl.test_ttl_delete
135136
ydb/tests/sql/large sole chunk chunk

ydb/core/formats/arrow/accessor/abstract/accessor.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,13 @@ class IChunkedArray {
9696
return Addresses.size();
9797
}
9898

99-
ui32 GetLocalIndex(const ui32 position) const {
100-
AFL_VERIFY(Contains(position))("pos", position)("start", GlobalStartPosition);
101-
return position - GlobalStartPosition;
99+
ui32 GetLocalIndex(const ui32 global) const {
100+
AFL_VERIFY(Contains(global))("pos", global)("start", GlobalStartPosition);
101+
return global - GlobalStartPosition;
102+
}
103+
104+
ui32 GetGlobalIndex(const ui32 local) const {
105+
return local + GlobalStartPosition;
102106
}
103107

104108
bool Contains(const ui32 position) const {

ydb/core/formats/arrow/accessor/abstract/constructor.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,12 @@ TConstructorContainer TConstructorContainer::GetDefaultConstructor() {
88
return result;
99
}
1010

11+
// Validates `columnData` against this constructor and then serializes it.
//
// Checks performed before delegating to DoSerializeToString:
//  - the accessor pointer is not null;
//  - the accessor type reported by the column equals this constructor's Type;
//  - the arrow data type of the column equals the column type carried by `externalInfo`.
// Any violated check trips AFL_VERIFY with the mismatching values attached.
TString IConstructor::SerializeToString(const std::shared_ptr<IChunkedArray>& columnData, const TChunkConstructionData& externalInfo) const {
    AFL_VERIFY(columnData);
    const IChunkedArray& column = *columnData;
    AFL_VERIFY(column.GetType() == Type)("column", column.GetType())("current", Type);
    AFL_VERIFY(column.GetDataType()->Equals(externalInfo.GetColumnType()))("column", column.GetDataType()->ToString())(
        "external", externalInfo.GetColumnType()->ToString());
    return DoSerializeToString(columnData, externalInfo);
}
18+
1119
}

ydb/core/formats/arrow/accessor/abstract/constructor.h

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,7 @@ class IConstructor {
4343

4444
virtual ~IConstructor() = default;
4545

46-
TString SerializeToString(const std::shared_ptr<IChunkedArray>& columnData, const TChunkConstructionData& externalInfo) const {
47-
AFL_VERIFY(columnData);
48-
AFL_VERIFY(columnData->GetType() == Type)("column", columnData->GetType())("current", Type);
49-
return DoSerializeToString(columnData, externalInfo);
50-
}
46+
TString SerializeToString(const std::shared_ptr<IChunkedArray>& columnData, const TChunkConstructionData& externalInfo) const;
5147

5248
bool IsEqualWithSameTypeTo(const IConstructor& item) const {
5349
return DoIsEqualWithSameTypeTo(item);

ydb/core/formats/arrow/accessor/sparsed/accessor.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,10 @@ class TSparsedArray: public IChunkedArray {
194194
const std::shared_ptr<arrow::Scalar>& defaultValue, const std::shared_ptr<arrow::DataType>& type, const ui32 recordsCount);
195195

196196
public:
197+
static EType GetTypeStatic() {
198+
return EType::SparsedArray;
199+
}
200+
197201
virtual void Reallocate() override;
198202

199203
static std::shared_ptr<TSparsedArray> BuildFalseArrayUI8(const ui32 recordsCount) {
@@ -223,6 +227,10 @@ class TSparsedArray: public IChunkedArray {
223227
return Record;
224228
}
225229

230+
const TSparsedArrayChunk& GetSparsedChunk() const {
231+
return Record;
232+
}
233+
226234
virtual std::shared_ptr<arrow::Scalar> DoGetScalar(const ui32 index) const override {
227235
return Record.GetScalar(index);
228236
}

ydb/core/formats/arrow/save_load/loader.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include "loader.h"
22

3+
#include <ydb/library/formats/arrow/switch/switch_type.h>
34
#include <ydb/library/formats/arrow/validation/validation.h>
45

56
namespace NKikimr::NArrow::NAccessor {
@@ -28,8 +29,7 @@ const std::shared_ptr<arrow::Field>& TColumnLoader::GetField() const {
2829
return ResultField;
2930
}
3031

31-
TChunkConstructionData TColumnLoader::BuildAccessorContext(
32-
const ui32 recordsCount, const std::optional<ui32>& notNullCount) const {
32+
TChunkConstructionData TColumnLoader::BuildAccessorContext(const ui32 recordsCount, const std::optional<ui32>& notNullCount) const {
3333
return TChunkConstructionData(recordsCount, DefaultValue, ResultField->type(), Serializer.GetObjectPtr(), notNullCount);
3434
}
3535

@@ -38,7 +38,8 @@ TConclusion<std::shared_ptr<IChunkedArray>> TColumnLoader::ApplyConclusion(
3838
return BuildAccessor(dataStr, BuildAccessorContext(recordsCount, notNullCount));
3939
}
4040

41-
std::shared_ptr<IChunkedArray> TColumnLoader::ApplyVerified(const TString& dataStr, const ui32 recordsCount, const std::optional<ui32>& notNullCount) const {
41+
std::shared_ptr<IChunkedArray> TColumnLoader::ApplyVerified(
42+
const TString& dataStr, const ui32 recordsCount, const std::optional<ui32>& notNullCount) const {
4243
return BuildAccessor(dataStr, BuildAccessorContext(recordsCount, notNullCount)).DetachResult();
4344
}
4445

@@ -63,4 +64,16 @@ bool TColumnLoader::IsEqualTo(const TColumnLoader& item) const {
6364
return true;
6465
}
6566

67+
std::optional<NSplitter::TSimpleSerializationStat> TColumnLoader::TryBuildColumnStat() const {
68+
std::optional<NSplitter::TSimpleSerializationStat> result;
69+
SwitchType(ResultField->type()->id(), [&](const auto switcher) {
70+
if constexpr (switcher.IsCType) {
71+
using CType = typename decltype(switcher)::ValueType;
72+
result = NSplitter::TSimpleSerializationStat(std::max<ui32>(1, sizeof(CType) / 2), 1, sizeof(CType));
73+
}
74+
return true;
75+
});
76+
return result;
77+
}
78+
6679
} // namespace NKikimr::NArrow::NAccessor

ydb/core/formats/arrow/save_load/loader.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <ydb/core/formats/arrow/serializer/abstract.h>
44

55
#include <ydb/library/accessor/accessor.h>
6+
#include <ydb/library/formats/arrow/splitter/stats.h>
67

78
#include <contrib/libs/apache/arrow/cpp/src/arrow/type.h>
89

@@ -19,15 +20,16 @@ class TColumnLoader {
1920
TConclusion<std::shared_ptr<IChunkedArray>> BuildAccessor(const TString& originalData, const TChunkConstructionData& chunkData) const;
2021

2122
public:
23+
std::optional<NSplitter::TSimpleSerializationStat> TryBuildColumnStat() const;
24+
2225
std::shared_ptr<IChunkedArray> BuildDefaultAccessor(const ui32 recordsCount) const;
2326

2427
bool IsEqualTo(const TColumnLoader& item) const;
2528

2629
TString DebugString() const;
2730

28-
TColumnLoader(const NSerialization::TSerializerContainer& serializer,
29-
const NAccessor::TConstructorContainer& accessorConstructor, const std::shared_ptr<arrow::Field>& resultField,
30-
const std::shared_ptr<arrow::Scalar>& defaultValue, const ui32 columnId);
31+
TColumnLoader(const NSerialization::TSerializerContainer& serializer, const NAccessor::TConstructorContainer& accessorConstructor,
32+
const std::shared_ptr<arrow::Field>& resultField, const std::shared_ptr<arrow::Scalar>& defaultValue, const ui32 columnId);
3133

3234
ui32 GetColumnId() const {
3335
return ColumnId;
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,66 @@
11
#include "abstract.h"
2+
#include <util/string/join.h>
23

34
namespace NKikimr::NKqp {
45

6+
// Parses a textual command description into free arguments and named properties,
// then hands them to DeserializeProperties.
//
// The description is split by '\n'; every line is stripped and empty lines are skipped.
// A line starting with one of the names from GetCommandProperties() switches the "active"
// property; the rest of that line and all following lines are appended (without any separator)
// to the active property's value. Lines seen before any property are collected as free arguments.
//
// Returns Success, or Fail with the nested error plus a dump of the parsed collection.
// Any exception thrown while parsing is converted into a Fail status.
TConclusionStatus ICommand::DeserializeFromString(const TString& description) {
    try {
        const auto rawLines = StringSplitter(description).SplitBySet("\n").ToList<TString>();
        const std::set<TString> knownProperties = GetCommandProperties();

        std::vector<TString> freeArguments;
        THashMap<TString, TString> properties;
        TString activeProperty;

        for (const TString& rawLine : rawLines) {
            TString line = Strip(rawLine);
            if (line.empty()) {
                continue;
            }
            // NOTE(review): the ':' / '=' stripping runs once per declared property, whether or not
            // that property matched, so a leading separator is also removed from plain lines
            // whenever the command declares at least one property. Kept as is.
            for (const TString& name : knownProperties) {
                if (line.StartsWith(name)) {
                    activeProperty = name;
                    line = Strip(line.substr(name.size()));
                }
                const bool startsWithSeparator = line.StartsWith(":") || line.StartsWith("=");
                if (startsWithSeparator) {
                    line = Strip(line.substr(1));
                }
            }
            if (line.empty()) {
                continue;
            }
            if (activeProperty.empty()) {
                freeArguments.emplace_back(line);
            } else {
                properties[activeProperty] += line;
            }
        }

        TPropertiesCollection collection(freeArguments, properties);
        const auto status = DeserializeProperties(collection);
        if (!status.IsFail()) {
            return TConclusionStatus::Success();
        }
        return TConclusionStatus::Fail(status.GetErrorMessage() + ":\n" + collection.DebugString());
    } catch (...) {
        return TConclusionStatus::Fail("exception on ICommand::DeserializeFromString: " + CurrentExceptionMessage());
    }
}
47+
48+
// Renders the collection as text: a FREE_ARGUMENTS(<count>) header followed by one argument per line,
// then a PROPERTIES(<count>) header followed by one "name:value" pair per line.
TString TPropertiesCollection::DebugString() const {
    TStringBuilder out;
    out << "FREE_ARGUMENTS(" << FreeArguments.size() << "):" << Endl;
    for (const TString& argument : FreeArguments) {
        out << " " << argument << Endl;
    }

    out << "PROPERTIES(" << Properties.size() << "):" << Endl;
    for (const auto& [name, value] : Properties) {
        out << " " << name << ":" << value << Endl;
    }
    return out;
}
61+
62+
// Concatenates all free arguments in their stored order, placing `delimiter` between neighbours.
TString TPropertiesCollection::JoinFreeArguments(const TString& delimiter /*= "\n"*/) const {
    TString joined = JoinSeq(delimiter, FreeArguments);
    return joined;
}
65+
566
} // namespace NKikimr::NKqp

ydb/core/kqp/ut/olap/combinatory/abstract.h

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,74 @@
33

44
namespace NKikimr::NKqp {
55

6+
class TPropertiesCollection {
7+
private:
8+
std::vector<TString> FreeArguments;
9+
THashMap<TString, TString> Properties;
10+
11+
public:
12+
TString DebugString() const;
13+
14+
TString JoinFreeArguments(const TString& delimiter = "\n") const;
15+
16+
TPropertiesCollection(const std::vector<TString>& freeArgs, const THashMap<TString, TString>& props)
17+
: FreeArguments(freeArgs)
18+
, Properties(props) {
19+
}
20+
21+
std::optional<TString> GetOptional(const TString& propertyName) const {
22+
auto it = Properties.find(propertyName);
23+
if (it == Properties.end()) {
24+
return std::nullopt;
25+
}
26+
return it->second;
27+
}
28+
29+
TString GetVerified(const TString& propertyName) const {
30+
auto it = Properties.find(propertyName);
31+
AFL_VERIFY(it != Properties.end())("name", propertyName)("props", DebugString());
32+
return it->second;
33+
}
34+
35+
ui32 GetFreeArgumentsCount() const {
36+
return FreeArguments.size();
37+
}
38+
39+
std::optional<TString> GetFreeArgumentOptional(const ui32 idx) const {
40+
if (idx < FreeArguments.size()) {
41+
return std::nullopt;
42+
}
43+
return FreeArguments[idx];
44+
}
45+
46+
TString GetFreeArgumentVerified(const ui32 idx) const {
47+
AFL_VERIFY(idx < FreeArguments.size())("idx", idx)("props", DebugString());
48+
return FreeArguments[idx];
49+
}
50+
};
51+
652
class ICommand {
753
private:
854
virtual TConclusionStatus DoExecute(TKikimrRunner& kikimr) = 0;
55+
virtual std::set<TString> DoGetCommandProperties() const {
56+
return {};
57+
}
58+
virtual TConclusionStatus DoDeserializeProperties(const TPropertiesCollection& /*props*/) {
59+
return TConclusionStatus::Success();
60+
}
61+
TConclusionStatus DeserializeProperties(const TPropertiesCollection& props) {
62+
return DoDeserializeProperties(props);
63+
}
964

1065
public:
1166
virtual ~ICommand() = default;
1267

68+
std::set<TString> GetCommandProperties() const {
69+
return DoGetCommandProperties();
70+
}
71+
72+
TConclusionStatus DeserializeFromString(const TString& description);
73+
1374
TConclusionStatus Execute(TKikimrRunner& kikimr) {
1475
return DoExecute(kikimr);
1576
}

ydb/core/kqp/ut/olap/combinatory/bulk_upsert.cpp

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,38 @@
77

88
namespace NKikimr::NKqp {
99

10-
bool TBulkUpsertCommand::DeserializeFromString(const TString& info) {
11-
auto lines = StringSplitter(info).SplitBySet("\n").SkipEmpty().ToList<TString>();
12-
if (lines.size() < 2 || lines.size() > 3) {
13-
return false;
10+
TConclusionStatus TBulkUpsertCommand::DoExecute(TKikimrRunner& kikimr) {
11+
if (ArrowBatch->num_rows() < PartsCount) {
12+
return TConclusionStatus::Fail(
13+
"not enough records(" + ::ToString(ArrowBatch->num_rows()) + ") for split in " + ::ToString(PartsCount) + " chunks");
1414
}
15-
TableName = Strip(lines[0]);
16-
ArrowBatch = Base64Decode(Strip(lines[1]));
17-
AFL_VERIFY(!!ArrowBatch);
18-
if (lines.size() == 3) {
19-
if (!Ydb::StatusIds_StatusCode_Parse(Strip(lines[2]), &ExpectedCode)) {
20-
return false;
21-
}
22-
// if (lines[2] == "SUCCESS") {
23-
// } else if (lines[2] = "INTERNAL_ERROR") {
24-
// ExpectedCode = Ydb::StatusIds::INTERNAL_ERROR;
25-
// } else if (lines[2] == "BAD_REQUEST") {
26-
// ExpectedCode = Ydb::StatusIds::BAD_REQUEST;
27-
// } else {
28-
// return false;
29-
// }
15+
ui32 cursor = 0;
16+
for (ui32 i = 0; i < PartsCount; ++i) {
17+
const ui32 size = (i + 1 != PartsCount) ? (ArrowBatch->num_rows() / PartsCount) : (ArrowBatch->num_rows() - cursor);
18+
TLocalHelper lHelper(kikimr);
19+
lHelper.SendDataViaActorSystem(TableName, ArrowBatch->Slice(cursor, size), ExpectedCode);
20+
cursor += size;
3021
}
31-
return true;
22+
AFL_VERIFY(cursor == ArrowBatch->num_rows());
23+
return TConclusionStatus::Success();
3224
}
3325

34-
TConclusionStatus TBulkUpsertCommand::DoExecute(TKikimrRunner& kikimr) {
35-
TLocalHelper lHelper(kikimr);
36-
lHelper.SendDataViaActorSystem(
37-
TableName, NArrow::TStatusValidator::GetValid(NArrow::NSerialization::TNativeSerializer().Deserialize(ArrowBatch)), ExpectedCode);
26+
// Fills the command from parsed properties.
//
// Expects exactly two free arguments: the table name and a base64-encoded batch that is decoded
// and deserialized with TNativeSerializer. Optional properties:
//  - EXPECT_STATUS: name of a Ydb::StatusIds status code expected from the upsert;
//  - PARTS_COUNT:   positive number of slices the batch is split into on execution.
// Returns Fail when the argument count is wrong or a property value cannot be parsed.
TConclusionStatus TBulkUpsertCommand::DoDeserializeProperties(const TPropertiesCollection& props) {
    if (props.GetFreeArgumentsCount() != 2) {
        return TConclusionStatus::Fail("incorrect free arguments count for BULK_UPSERT command");
    }
    TableName = props.GetFreeArgumentVerified(0);
    ArrowBatch = NArrow::TStatusValidator::GetValid(
        NArrow::NSerialization::TNativeSerializer().Deserialize(Base64Decode(props.GetFreeArgumentVerified(1))));
    if (const auto value = props.GetOptional("EXPECT_STATUS")) {
        if (!Ydb::StatusIds_StatusCode_Parse(*value, &ExpectedCode)) {
            return TConclusionStatus::Fail("cannot parse EXPECT_STATUS from " + *value);
        }
    }
    if (const auto value = props.GetOptional("PARTS_COUNT")) {
        ui32 partsCount = 0;
        if (!TryFromString<ui32>(*value, partsCount)) {
            return TConclusionStatus::Fail("cannot parse PARTS_COUNT from " + *value);
        }
        // With zero parts DoExecute sends no slices, so its final cursor verification
        // fails for any non-empty batch; reject the value here instead.
        if (partsCount == 0) {
            return TConclusionStatus::Fail("PARTS_COUNT must be positive, got " + *value);
        }
        PartsCount = partsCount;
    }
    return TConclusionStatus::Success();
}
4044

0 commit comments

Comments
 (0)