Skip to content

Commit

Permalink
Implement TPCH Query 2 in TpchQueryBuilder (#9825)
Browse files Browse the repository at this point in the history
Summary:
This PR introduces TPC-H Query 2 into the TpchQueryBuilder and extends the TpchBenchmark and ParquetTpchTest to include this query. Additionally, it provides a detailed performance comparison with DuckDB using the Parquet file format and includes the output of PrintPlanWithStats for detailed analysis.
The TPC-H scale factor used is 1.
A PowerPoint presentation containing a detailed description for each driver and thread is available here: https://ibm.box.com/s/sau464qdfac45aainwpj6pyvkvbtlsat

### Performance Comparison
  Chip: Apple M1 Pro
  Total Number of Cores: 10 (8 performance and 2 efficiency)
  Memory: 32 GB

The following table summarizes the performance comparison between Velox and DuckDB (with Parquet file format) across various numbers of threads/drivers:

| Number of Threads/Drivers | Velox (ms) | DuckDB (ms) |
|:----------------------:|:---------:|:----------:|
|            1           |     27     |     88.4    |
|            4           |     23     |     84.1    |
|            8           |     25     |     82.8    |
|           16           |     30     |     84    |

Pull Request resolved: #9825

Reviewed By: bikramSingh91

Differential Revision: D58244304

Pulled By: kevinwilfong

fbshipit-source-id: 4d216b10b49847ab692a394783bd5c59a59c9eb2
  • Loading branch information
deepthydavis authored and facebook-github-bot committed Jun 7, 2024
1 parent b0c2f44 commit 20470a0
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 0 deletions.
5 changes: 5 additions & 0 deletions velox/benchmarks/tpch/TpchBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,11 @@ BENCHMARK(q1) {
benchmark.run(planContext);
}

// TPC-H Q2: fetches the plan for query 2 from the query builder and hands it
// straight to the benchmark runner.
BENCHMARK(q2) {
  benchmark.run(queryBuilder->getQueryPlan(2));
}

BENCHMARK(q3) {
const auto planContext = queryBuilder->getQueryPlan(3);
benchmark.run(planContext);
Expand Down
5 changes: 5 additions & 0 deletions velox/dwio/parquet/tests/ParquetTpchTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,11 @@ TEST_F(ParquetTpchTest, Q1) {
assertQuery(1);
}

// Verifies TPC-H Q2 through assertQuery, passing {0, 1, 2, 3} as the sorting
// keys.
TEST_F(ParquetTpchTest, Q2) {
  assertQuery(2, std::vector<uint32_t>{0, 1, 2, 3});
}

TEST_F(ParquetTpchTest, Q3) {
std::vector<uint32_t> sortingKeys{1, 2};
assertQuery(3, std::move(sortingKeys));
Expand Down
203 changes: 203 additions & 0 deletions velox/exec/tests/utils/TpchQueryBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ TpchPlan TpchQueryBuilder::getQueryPlan(int queryId) const {
switch (queryId) {
case 1:
return getQ1Plan();
case 2:
return getQ2Plan();
case 3:
return getQ3Plan();
case 5:
Expand Down Expand Up @@ -238,6 +240,207 @@ TpchPlan TpchQueryBuilder::getQ1Plan() const {
return context;
}

// Builds the plan for TPC-H Query 2.
//
// The plan consists of two branches that are joined at the very end:
//
//  * Sub-query branch: partsupp JOIN supplier JOIN nation JOIN region (with
//    r_name = 'EUROPE'), aggregated into min(ps_supplycost) per ps_partkey.
//  * Main branch: partsupp JOIN part (p_type like '%BRASS', p_size = 15) JOIN
//    supplier JOIN nation JOIN region (with r_name = 'EUROPE').
//
// The final hash join matches the two branches on the part key and keeps only
// the rows whose ps_supplycost equals the per-part minimum. The result is
// ordered by (s_acctbal DESC, n_name, s_name, p_partkey) and limited to the
// first 100 rows.
//
// Each of the nine table scans gets its own plan node id; the returned
// TpchPlan maps every one of these ids to the data files of the scanned table.
//
// NOTE: The PlanBuilder chains below are intentionally kept in this order,
// since they all draw ids from the same planNodeIdGenerator.
TpchPlan TpchQueryBuilder::getQ2Plan() const {
  // Columns read by the sub-query branch; only join keys are needed there.
  std::vector<std::string> supplierColumnsSubQuery = {
      "s_suppkey", "s_nationkey"};
  std::vector<std::string> nationColumnsSubQuery = {
      "n_nationkey", "n_regionkey"};

  // Columns read by the main branch.
  std::vector<std::string> supplierColumns = {
      "s_acctbal",
      "s_name",
      "s_address",
      "s_phone",
      "s_comment",
      "s_suppkey",
      "s_nationkey"};
  std::vector<std::string> partColumns = {
      "p_partkey", "p_mfgr", "p_size", "p_type"};
  std::vector<std::string> partsuppColumns = {
      "ps_partkey", "ps_suppkey", "ps_supplycost"};
  std::vector<std::string> nationColumns = {
      "n_nationkey", "n_name", "n_regionkey"};
  std::vector<std::string> regionColumns = {"r_regionkey", "r_name"};

  // Row types differ between the two branches, but the file column names
  // depend only on the table, so they are looked up once per table and shared
  // by both branches.
  auto supplierSelectedRowTypeSubQuery =
      getRowType(kSupplier, supplierColumnsSubQuery);
  auto nationSelectedRowTypeSubQuery =
      getRowType(kNation, nationColumnsSubQuery);
  auto partSelectedRowType = getRowType(kPart, partColumns);
  const auto& partFileColumns = getFileColumnNames(kPart);
  auto supplierSelectedRowType = getRowType(kSupplier, supplierColumns);
  const auto& supplierFileColumns = getFileColumnNames(kSupplier);
  auto partsuppSelectedRowType = getRowType(kPartsupp, partsuppColumns);
  const auto& partsuppFileColumns = getFileColumnNames(kPartsupp);
  auto nationSelectedRowType = getRowType(kNation, nationColumns);
  const auto& nationFileColumns = getFileColumnNames(kNation);
  auto regionSelectedRowType = getRowType(kRegion, regionColumns);
  const auto& regionFileColumns = getFileColumnNames(kRegion);

  const std::string regionNameFilter = "r_name = 'EUROPE'";

  auto planNodeIdGenerator = std::make_shared<core::PlanNodeIdGenerator>();
  core::PlanNodeId supplierScanIdSubQuery;
  core::PlanNodeId partsuppScanIdSubQuery;
  core::PlanNodeId nationScanIdSubQuery;
  core::PlanNodeId regionScanIdSubQuery;
  core::PlanNodeId partScanId;
  core::PlanNodeId supplierScanId;
  core::PlanNodeId partsuppScanId;
  core::PlanNodeId nationScanId;
  core::PlanNodeId regionScanId;

  // ---- Sub-query branch --------------------------------------------------

  // region filtered to 'EUROPE'.
  auto regionSubQuery = PlanBuilder(planNodeIdGenerator)
                            .tableScan(
                                kRegion,
                                regionSelectedRowType,
                                regionFileColumns,
                                {regionNameFilter})
                            .capturePlanNodeId(regionScanIdSubQuery)
                            .planNode();

  // nation JOIN region; only n_nationkey is needed downstream.
  auto nationJoinRegionSubQuery =
      PlanBuilder(planNodeIdGenerator)
          .tableScan(
              kNation, nationSelectedRowTypeSubQuery, nationFileColumns)
          .capturePlanNodeId(nationScanIdSubQuery)
          .hashJoin(
              {"n_regionkey"},
              {"r_regionkey"},
              regionSubQuery,
              "",
              {"n_nationkey"})
          .planNode();

  // supplier JOIN (nation JOIN region); only s_suppkey is needed downstream.
  auto supplierJoinNationJoinRegionSubQuery =
      PlanBuilder(planNodeIdGenerator)
          .tableScan(
              kSupplier, supplierSelectedRowTypeSubQuery, supplierFileColumns)
          .capturePlanNodeId(supplierScanIdSubQuery)
          .hashJoin(
              {"s_nationkey"},
              {"n_nationkey"},
              nationJoinRegionSubQuery,
              "",
              {"s_suppkey"})
          .planNode();

  // ---- Main branch -------------------------------------------------------

  // part with the p_type predicate passed to the scan as its fifth argument
  // and the p_size predicate applied by a separate filter node.
  auto part = PlanBuilder(planNodeIdGenerator)
                  .tableScan(
                      kPart,
                      partSelectedRowType,
                      partFileColumns,
                      {},
                      "p_type like '%BRASS'")
                  .capturePlanNodeId(partScanId)
                  .filter("p_size = 15")
                  .planNode();

  // region filtered to 'EUROPE'. A separate scan node from the one used by
  // the sub-query branch, with its own plan node id.
  auto region = PlanBuilder(planNodeIdGenerator)
                    .tableScan(
                        kRegion,
                        regionSelectedRowType,
                        regionFileColumns,
                        {regionNameFilter})
                    .capturePlanNodeId(regionScanId)
                    .planNode();

  // nation JOIN region; n_name is part of the final output.
  auto nationJoinRegion =
      PlanBuilder(planNodeIdGenerator)
          .tableScan(kNation, nationSelectedRowType, nationFileColumns)
          .capturePlanNodeId(nationScanId)
          .hashJoin(
              {"n_regionkey"},
              {"r_regionkey"},
              region,
              "",
              {"n_nationkey", "n_name"})
          .planNode();

  // supplier JOIN (nation JOIN region). supplierColumns already contains
  // s_suppkey, so only n_name has to be appended to the output layout.
  auto supplierJoinNationJoinRegion =
      PlanBuilder(planNodeIdGenerator)
          .tableScan(kSupplier, supplierSelectedRowType, supplierFileColumns)
          .capturePlanNodeId(supplierScanId)
          .hashJoin(
              {"s_nationkey"},
              {"n_nationkey"},
              nationJoinRegion,
              "",
              mergeColumnNames(supplierColumns, {"n_name"}))
          .planNode();

  // partsupp JOIN part JOIN (supplier JOIN nation JOIN region).
  // ps_supplycost is kept because the final join filter compares it against
  // the per-part minimum computed by the sub-query branch.
  auto partsuppJoinPartJoinSupplierJoinNationJoinRegion =
      PlanBuilder(planNodeIdGenerator)
          .tableScan(kPartsupp, partsuppSelectedRowType, partsuppFileColumns)
          .capturePlanNodeId(partsuppScanId)
          .hashJoin(
              {"ps_partkey"},
              {"p_partkey"},
              part,
              "",
              {"ps_suppkey", "ps_supplycost", "p_partkey", "p_mfgr"})
          .hashJoin(
              {"ps_suppkey"},
              {"s_suppkey"},
              supplierJoinNationJoinRegion,
              "",
              mergeColumnNames(
                  supplierColumns,
                  {"ps_supplycost", "p_partkey", "p_mfgr", "n_name"}))
          .planNode();

  // ---- Final plan --------------------------------------------------------

  // Computes min(ps_supplycost) per ps_partkey over the sub-query branch,
  // joins it with the main branch on the part key, and keeps only the rows
  // whose supply cost equals that minimum.
  auto plan =
      PlanBuilder(planNodeIdGenerator)
          .tableScan(kPartsupp, partsuppSelectedRowType, partsuppFileColumns)
          .capturePlanNodeId(partsuppScanIdSubQuery)
          .hashJoin(
              {"ps_suppkey"},
              {"s_suppkey"},
              supplierJoinNationJoinRegionSubQuery,
              "",
              {"ps_supplycost", "ps_partkey"})
          .partialAggregation(
              {"ps_partkey"}, {"min(ps_supplycost) AS min_supplycost"})
          .localPartition({"ps_partkey"})
          .finalAggregation()
          .hashJoin(
              {"ps_partkey"},
              {"p_partkey"},
              partsuppJoinPartJoinSupplierJoinNationJoinRegion,
              "ps_supplycost = min_supplycost",
              mergeColumnNames(
                  supplierColumns, {"p_partkey", "p_mfgr", "n_name"}))
          .orderBy({"s_acctbal DESC", "n_name", "s_name", "p_partkey"}, false)
          .project(
              {"s_acctbal",
               "s_name",
               "n_name",
               "p_partkey",
               "p_mfgr",
               "s_address",
               "s_phone",
               "s_comment"})
          .limit(0, 100, false)
          .planNode();

  // Register the data files for every scan node of both branches.
  TpchPlan context;
  context.plan = std::move(plan);
  context.dataFiles[supplierScanIdSubQuery] = getTableFilePaths(kSupplier);
  context.dataFiles[partsuppScanIdSubQuery] = getTableFilePaths(kPartsupp);
  context.dataFiles[nationScanIdSubQuery] = getTableFilePaths(kNation);
  context.dataFiles[regionScanIdSubQuery] = getTableFilePaths(kRegion);
  context.dataFiles[partScanId] = getTableFilePaths(kPart);
  context.dataFiles[supplierScanId] = getTableFilePaths(kSupplier);
  context.dataFiles[partsuppScanId] = getTableFilePaths(kPartsupp);
  context.dataFiles[nationScanId] = getTableFilePaths(kNation);
  context.dataFiles[regionScanId] = getTableFilePaths(kRegion);
  context.dataFileFormat = format_;
  return context;
}

TpchPlan TpchQueryBuilder::getQ3Plan() const {
std::vector<std::string> lineitemColumns = {
"l_shipdate", "l_orderkey", "l_extendedprice", "l_discount"};
Expand Down
1 change: 1 addition & 0 deletions velox/exec/tests/utils/TpchQueryBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ class TpchQueryBuilder {
const std::vector<std::string>& columns);

TpchPlan getQ1Plan() const;
TpchPlan getQ2Plan() const;
TpchPlan getQ3Plan() const;
TpchPlan getQ5Plan() const;
TpchPlan getQ6Plan() const;
Expand Down

0 comments on commit 20470a0

Please sign in to comment.