Skip to content

Multicolumn sparsed test #8284

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Aug 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions ydb/core/formats/arrow/simple_builder/array.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,27 +52,30 @@ class TSimpleArrayConstructor: public IArrayBuilder {
using TSelf = TSimpleArrayConstructor<TFiller>;
using TBuilder = typename arrow::TypeTraits<typename TFiller::TValue>::BuilderType;
const TFiller Filler;
ui32 ShiftValue = 0;

TSimpleArrayConstructor(const TString& fieldName, bool nullable, const TFiller& filler)
TSimpleArrayConstructor(const TString& fieldName, bool nullable, const TFiller& filler, ui32 shiftValue = 0)
: TBase(fieldName, nullable)
, Filler(filler)
, ShiftValue(shiftValue)
{
}
protected:
virtual std::shared_ptr<arrow::Array> DoBuildArray(const ui32 recordsCount) const override {
TBuilder fBuilder = TFillerBuilderConstructor<typename TFiller::TValue>::Construct();
Y_ABORT_UNLESS(fBuilder.Reserve(recordsCount).ok());
for (ui32 i = 0; i < recordsCount; ++i) {
Y_ABORT_UNLESS(fBuilder.Append(Filler.GetValue(i)).ok());
Y_ABORT_UNLESS(fBuilder.Append(Filler.GetValue(i + ShiftValue)).ok());
}
return *fBuilder.Finish();
}


public:
TSimpleArrayConstructor(const TString& fieldName, const TFiller& filler = TFiller())
TSimpleArrayConstructor(const TString& fieldName, const TFiller& filler = TFiller(), ui32 shiftValue = 0)
: TBase(fieldName)
, Filler(filler)
, ShiftValue(shiftValue)
{
}

Expand Down
63 changes: 63 additions & 0 deletions ydb/core/formats/arrow/simple_builder/filler.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@
#include <contrib/libs/apache/arrow/cpp/src/arrow/type.h>
#include <contrib/libs/apache/arrow/cpp/src/arrow/type_traits.h>
#include <contrib/libs/apache/arrow/cpp/src/arrow/util/string_view.h>

#include <library/cpp/testing/unittest/registar.h>

#include <util/generic/string.h>
#include <util/system/types.h>
#include <util/random/random.h>

namespace NKikimr::NArrow::NConstruction {

Expand All @@ -25,6 +29,65 @@ class TIntSeqFiller {
}
};

class TStringType : public arrow::StringType {
public:
using c_type = TString;
};

template <class TArrowType>
class TPoolFiller {
private:
using CType = typename TArrowType::c_type;

private:
std::vector<CType> Data;

public:
using TValue = std::conditional_t<std::is_same_v<TArrowType, TStringType>, arrow::StringType, TArrowType>;
using ValueType = std::conditional_t<std::is_same_v<TArrowType, TStringType>, arrow::util::string_view, CType>;

static CType GetRandomNumberNotEqDef(CType defaultValue) {
CType result;
do {
result = RandomNumber<double>() * std::numeric_limits<CType>::max();
} while (result == defaultValue);
return result;
}

TPoolFiller(const ui32 poolSize, const CType defaultValue, const double defaultValueFrq) {
for (ui32 i = 0; i < poolSize; ++i) {
if (RandomNumber<double>() < defaultValueFrq) {
Data.emplace_back(defaultValue);
} else {
Data.emplace_back(GetRandomNumberNotEqDef(defaultValue));
}
}
}

TPoolFiller(const ui32 poolSize, const ui32 strLen, const TString& defaultValue, const double defaultValueFrq) {
for (ui32 i = 0; i < poolSize; ++i) {
if (RandomNumber<double>() < defaultValueFrq) {
Data.emplace_back(defaultValue);
} else {
Data.emplace_back(NUnitTest::RandomString(strLen, i));
}
}
}

template<class Type>
const ValueType Convert(const Type& v) const {
return v;
}

const ValueType Convert(const TString& str) const {
return arrow::util::string_view(str.data(), str.size());
}

ValueType GetValue(const ui32 idx) const {
return Convert(Data[(2 + 7 * idx) % Data.size()]);
}
};

template <class TArrowInt>
class TIntConstFiller {
public:
Expand Down
25 changes: 25 additions & 0 deletions ydb/core/kqp/ut/olap/helpers/typed_local.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,31 @@ TString TTypedLocalHelper::GetTestTableSchema() const {
return result;
}

TString TTypedLocalHelper::GetMultiColumnTestTableSchema(ui32 reps) const {
TString result;
result += R"(
Columns { Name: "pk_int" Type: "Int64" NotNull: true }
Columns { Name: "ts" Type: "Timestamp" }
)";
for (ui32 i = 0; i < reps; i++) {
TString strNum = ToString(i);
result += "Columns {Name: \"field_utf" + strNum + "\" Type: \"Utf8\"}\n";
result += "Columns {Name: \"field_int" + strNum + "\" Type: \"Int64\"}\n";
result += "Columns {Name: \"field_uint" + strNum + "\" Type: \"Uint8\"}\n";
result += "Columns {Name: \"field_float" + strNum + "\" Type: \"Float\"}\n";
result += "Columns {Name: \"field_double" + strNum + "\" Type: \"Double\"}\n";
}
result += R"(
KeyColumnNames: "pk_int"
Engine: COLUMN_ENGINE_REPLACING_TIMESERIES
)";
return result;
}

void TTypedLocalHelper::CreateMultiColumnOlapTableWithStore(ui32 reps, ui32 storeShardsCount, ui32 tableShardsCount) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

а это ты скопировал откуда-то? нельзя переиспользовать?

CreateSchemaOlapTableWithStore(GetMultiColumnTestTableSchema(reps), TableName, "olapStore", storeShardsCount, tableShardsCount);
}

void TTypedLocalHelper::ExecuteSchemeQuery(const TString& alterQuery, const NYdb::EStatus expectedStatus /*= EStatus::SUCCESS*/) const {
auto session = KikimrRunner.GetTableClient().CreateSession().GetValueSync().GetSession();
auto alterResult = session.ExecuteSchemeQuery(alterQuery).GetValueSync();
Expand Down
31 changes: 30 additions & 1 deletion ydb/core/kqp/ut/olap/helpers/typed_local.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

#include <ydb/public/sdk/cpp/client/ydb_types/status_codes.h>

#include <contrib/libs/apache/arrow/cpp/src/arrow/array/builder_binary.h>

#include <library/cpp/json/writer/json_value.h>

namespace NKikimr::NKqp {
Expand Down Expand Up @@ -83,11 +85,38 @@ class TTypedLocalHelper: public Tests::NCS::THelper {
TBase::SendDataViaActorSystem(TablePath, batch);
}

void FillMultiColumnTable(ui32 repCount, const double pkKff = 0, const ui32 numRows = 800000) const {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

вот тут генерацию батча можно хорошо параметризировать и переиспользовать...

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Сейчас есть такой объект как NTest::TTestColumn
И везде сейчас кастомные построения Builder, etc...
Хочется, чтобы в TTestColumn можно было задать способ генерации данных (возможно, не совпадающий с метаданными, описывающими схему, но по умолчанию берущий данные из схемы)
И тогда класс принимает на вход список TTestColumn и может сделать вариативно - построить arrow::RecordBatch, сделать запрос на построение схемы, построить arrow::Schema
для начала - самое то. тогда можно будет сделать аккуратнее и убрать код в следующих местах:
ut_columnshard_schema.cpp:MakeData
columnshard_ut_common.cpp:MakeTestBlob
core/testlib/cs_helper.cpp:NKikimr::Tests::NCS::THelper::TestArrowBatch
TCickBenchHelper::TestArrowBatch

в общем - поищи - там по тестам везде постоянно генерят данные, строят схемы -- хочется все это для всех тестов унифицировать, используя TTestColumn, в котором можно указать, например, что
генерация данных одна, а типа данных для схемы - другой. чтобы это везде переиспользовалось потом

const double frq = 0.9;
NArrow::NConstruction::TPoolFiller<arrow::Int64Type> int64Pool(1000, 0, frq);
NArrow::NConstruction::TPoolFiller<arrow::UInt8Type> uint8Pool(1000, 0, frq);
NArrow::NConstruction::TPoolFiller<arrow::FloatType> floatPool(1000, 0, frq);
NArrow::NConstruction::TPoolFiller<arrow::DoubleType> doublePool(1000, 0, frq);
NArrow::NConstruction::TPoolFiller<NKikimr::NArrow::NConstruction::TStringType> utfPool(1000, 52, "abcde", frq);

std::vector<NArrow::NConstruction::IArrayBuilder::TPtr> builders;
builders.emplace_back(NArrow::NConstruction::TSimpleArrayConstructor<NArrow::NConstruction::TIntSeqFiller<arrow::Int64Type>>::BuildNotNullable("pk_int", numRows * pkKff));
for (ui32 i = 0; i < repCount; i++) {
TString repStr = ToString(i);
builders.emplace_back(std::make_shared<NArrow::NConstruction::TSimpleArrayConstructor<NArrow::NConstruction::TPoolFiller<NKikimr::NArrow::NConstruction::TStringType>>>("field_utf" + repStr, utfPool, i));
builders.emplace_back(std::make_shared<NArrow::NConstruction::TSimpleArrayConstructor<NArrow::NConstruction::TPoolFiller<arrow::Int64Type>>>("field_int" + repStr, int64Pool, i));
builders.emplace_back(std::make_shared<NArrow::NConstruction::TSimpleArrayConstructor<NArrow::NConstruction::TPoolFiller<arrow::UInt8Type>>>("field_uint" + repStr, uint8Pool, i));
builders.emplace_back(std::make_shared<NArrow::NConstruction::TSimpleArrayConstructor<NArrow::NConstruction::TPoolFiller<arrow::FloatType>>>("field_float" + repStr, floatPool, i));
builders.emplace_back(std::make_shared<NArrow::NConstruction::TSimpleArrayConstructor<NArrow::NConstruction::TPoolFiller<arrow::DoubleType>>>("field_double" + repStr, doublePool, i));
}
NArrow::NConstruction::TRecordBatchConstructor batchBuilder(builders);
std::shared_ptr<arrow::RecordBatch> batch = batchBuilder.BuildBatch(numRows);
TBase::SendDataViaActorSystem(TablePath, batch);
}


void FillPKOnly(const double pkKff = 0, const ui32 numRows = 800000) const;

void CreateTestOlapTable(ui32 storeShardsCount = 4, ui32 tableShardsCount = 3) {
CreateOlapTableWithStore(TableName, StoreName, storeShardsCount, tableShardsCount);
}

TString GetMultiColumnTestTableSchema(ui32 reps) const;
void CreateMultiColumnOlapTableWithStore(ui32 reps, ui32 storeShardsCount = 4, ui32 tableShardsCount = 3);
};

}
}
Loading
Loading