Skip to content

change partitioning in tpch #17416

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Apr 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ydb/apps/ydb/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
* Added `--scale` option to `ydb workload tpch init` and `ydb workload tpcds init` commands. Sets the percentage of the benchmark's data size and workload to use, relative to full scale.
* Added "--no-discovery" option. It allows to skip discovery and use user provided endpoint to connect to YDB cluster.
* Added `--retries` to `ydb workload <clickbenh|tpch|tpcds> run` command.
* Added `--partition-size` param to `ydb workload <clickbench/tpcds/tpch> init`.
Expand Down
28 changes: 20 additions & 8 deletions ydb/library/workload/benchmark_base/workload.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,22 @@ const TString TWorkloadGeneratorBase::CsvFormatString = [] () {
return settings.SerializeAsString();
} ();

namespace {

TString KeysList(const NJson::TJsonValue& table, const TString& key) {
TVector<TStringBuf> keysV;
for (const auto& k: table[key].GetArray()) {
keysV.emplace_back(k.GetString());
}
return JoinSeq(", ", keysV);
}

}

ui32 TWorkloadGeneratorBase::GetDefaultPartitionsCount(const TString& /*tableName*/) const {
return 64;
}

void TWorkloadGeneratorBase::GenerateDDLForTable(IOutputStream& result, const NJson::TJsonValue& table, bool single) const {
auto specialTypes = GetSpecialDataTypes();
specialTypes["string_type"] = Params.GetStringType();
Expand Down Expand Up @@ -65,11 +81,7 @@ void TWorkloadGeneratorBase::GenerateDDLForTable(IOutputStream& result, const NJ
}
}
result << JoinSeq(",\n", columns);
TVector<TStringBuf> keysV;
for (const auto& k: table["primary_key"].GetArray()) {
keysV.emplace_back(k.GetString());
}
const TString keys = JoinSeq(", ", keysV);
const auto keys = KeysList(table, "primary_key");
if (Params.GetStoreType() == TWorkloadBaseParams::EStoreType::ExternalS3) {
result << Endl;
} else {
Expand All @@ -78,7 +90,7 @@ void TWorkloadGeneratorBase::GenerateDDLForTable(IOutputStream& result, const NJ
result << ")" << Endl;

if (Params.GetStoreType() == TWorkloadBaseParams::EStoreType::Column) {
result << "PARTITION BY HASH (" << keys << ")" << Endl;
result << "PARTITION BY HASH (" << (table.Has("partition_by") ? KeysList(table, "partition_by") : keys) << ")" << Endl;
}

result << "WITH (" << Endl;
Expand All @@ -89,12 +101,12 @@ void TWorkloadGeneratorBase::GenerateDDLForTable(IOutputStream& result, const NJ
break;
case TWorkloadBaseParams::EStoreType::Column:
result << " STORE = COLUMN," << Endl;
result << " AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = " << table["partitioning"].GetUIntegerSafe(64) << Endl;
result << " AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = " << table["partitioning"].GetUIntegerSafe(GetDefaultPartitionsCount(tableName)) << Endl;
break;
case TWorkloadBaseParams::EStoreType::Row:
result << " STORE = ROW," << Endl;
result << " AUTO_PARTITIONING_PARTITION_SIZE_MB = " << Params.GetPartitionSizeMb() << ", " << Endl;
result << " AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = " << table["partitioning"].GetUIntegerSafe(64) << Endl;
result << " AUTO_PARTITIONING_MIN_PARTITIONS_COUNT = " << table["partitioning"].GetUIntegerSafe(GetDefaultPartitionsCount(tableName)) << Endl;
}
result << ");" << Endl;
}
Expand Down
1 change: 1 addition & 0 deletions ydb/library/workload/benchmark_base/workload.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class TWorkloadGeneratorBase : public IWorkloadQueryGenerator {
virtual TString GetTablesYaml() const = 0;
virtual TSpecialDataTypes GetSpecialDataTypes() const = 0;
NJson::TJsonValue GetTablesJson() const;
virtual ui32 GetDefaultPartitionsCount(const TString& tableName) const;

THolder<TGeneratorStateProcessor> StateProcessor;
private:
Expand Down
4 changes: 3 additions & 1 deletion ydb/library/workload/tpc_base/tpc_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,7 @@ void TTpcBaseWorkloadParams::ConfigureOpts(NLastGetopt::TOpts& opts, const EComm
.StoreResult(&ExternalQueriesDir);
opts.AddLongOption( "syntax", "Query syntax [" + GetEnumAllNames<EQuerySyntax>() + "].")
.StoreResult(&Syntax).DefaultValue(Syntax);
opts.AddLongOption("scale", "scale in percents")
opts.AddLongOption("scale", "Sets the percentage of the benchmark's data size and workload to use, relative to full scale.")
.DefaultValue(Scale).StoreResult(&Scale);
opts.AddLongOption("float-mode", "Float mode. Can be float, decimal or decimal_ydb. If set to 'float' - float will be used, 'decimal' means that decimal will be used with canonical size and 'decimal_ydb' means that all floats will be converted to decimal(22,9) because YDB supports only this type.")
.StoreResult(&FloatMode).DefaultValue(FloatMode);
Expand All @@ -177,6 +177,8 @@ void TTpcBaseWorkloadParams::ConfigureOpts(NLastGetopt::TOpts& opts, const EComm
case TWorkloadParams::ECommandType::Init:
opts.AddLongOption("float-mode", "Float mode. Can be float, decimal or decimal_ydb. If set to 'float' - float will be used, 'decimal' means that decimal will be used with canonical size and 'decimal_ydb' means that all floats will be converted to decimal(22,9) because YDB supports only this type.")
.StoreResult(&FloatMode).DefaultValue(FloatMode);
opts.AddLongOption("scale", "Sets the percentage of the benchmark's data size and workload to use, relative to full scale.")
.DefaultValue(Scale).StoreResult(&Scale);
break;
default:
break;
Expand Down
2 changes: 1 addition & 1 deletion ydb/library/workload/tpcds/data_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ TTpcdsWorkloadDataInitializerGenerator::TTpcdsWorkloadDataInitializerGenerator(c

void TTpcdsWorkloadDataInitializerGenerator::ConfigureOpts(NLastGetopt::TOpts& opts) {
TWorkloadDataInitializerBase::ConfigureOpts(opts);
opts.AddLongOption("scale", "scale in percents")
opts.AddLongOption("scale", "Sets the percentage of the benchmark's data size and workload to use, relative to full scale.")
.DefaultValue(Scale).StoreResult(&Scale);
opts.AddLongOption("tables", "Commaseparated list of tables for generate. Empty means all tables.\n"
"Enabled tables: " + JoinSeq(", ", TBulkDataGenerator::TFactory::GetRegisteredKeys()))
Expand Down
2 changes: 1 addition & 1 deletion ydb/library/workload/tpch/data_generator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ TTpchWorkloadDataInitializerGenerator::TTpchWorkloadDataInitializerGenerator(con

void TTpchWorkloadDataInitializerGenerator::ConfigureOpts(NLastGetopt::TOpts& opts) {
TWorkloadDataInitializerBase::ConfigureOpts(opts);
opts.AddLongOption("scale", "scale in percents")
opts.AddLongOption("scale", "Sets the percentage of the benchmark's data size and workload to use, relative to full scale.")
.DefaultValue(Scale).StoreResult(&Scale);
opts.AddLongOption("tables", "Commaseparated list of tables for generate. Empty means all tables.\n"
"Enabled tables: " + JoinSeq(", ", TBulkDataGenerator::TFactory::GetRegisteredKeys()))
Expand Down
4 changes: 4 additions & 0 deletions ydb/library/workload/tpch/tpch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ TWorkloadGeneratorBase::TSpecialDataTypes TTpchWorkloadGenerator::GetSpecialData
}
}

ui32 TTpchWorkloadGenerator::GetDefaultPartitionsCount(const TString& /*tableName*/) const {
return Params.GetScale() <= 10 ? 64 : 256;
}


THolder<IWorkloadQueryGenerator> TTpchWorkloadParams::CreateGenerator() const {
return MakeHolder<TTpchWorkloadGenerator>(*this);
Expand Down
1 change: 1 addition & 0 deletions ydb/library/workload/tpch/tpch.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class TTpchWorkloadGenerator final: public TTpcBaseWorkloadGenerator {
protected:
TString GetTablesYaml() const override;
TWorkloadGeneratorBase::TSpecialDataTypes GetSpecialDataTypes() const override;
ui32 GetDefaultPartitionsCount(const TString& tableName) const override;

private:
const TTpchWorkloadParams& Params;
Expand Down
2 changes: 2 additions & 0 deletions ydb/library/workload/tpch/tpch_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ tables:
primary_key:
- l_orderkey
- l_linenumber
partition_by:
- l_orderkey

- name: nation
columns:
Expand Down
6 changes: 6 additions & 0 deletions ydb/tests/functional/benchmarks_init/canondata/result.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
"test_init.TestClickbenchInit.test_s1_s3": {
"uri": "file://test_init.TestClickbenchInit.test_s1_s3/s1_s3"
},
"test_init.TestTpcdsInit.test_s100_column": {
"uri": "file://test_init.TestTpcdsInit.test_s100_column/s100_column"
},
"test_init.TestTpcdsInit.test_s1_column": {
"uri": "file://test_init.TestTpcdsInit.test_s1_column/s1_column"
},
Expand All @@ -47,6 +50,9 @@
"test_init.TestTpcdsInit.test_s1_s3": {
"uri": "file://test_init.TestTpcdsInit.test_s1_s3/s1_s3"
},
"test_init.TestTpchInit.test_s100_column": {
"uri": "file://test_init.TestTpchInit.test_s100_column/s100_column"
},
"test_init.TestTpchInit.test_s1_column": {
"uri": "file://test_init.TestTpchInit.test_s1_column/s1_column"
},
Expand Down
Loading
Loading