From 20470a09ce1f05676fabaf4f83e475e9ef16839a Mon Sep 17 00:00:00 2001
From: deepthydavis
Date: Fri, 7 Jun 2024 10:19:43 -0700
Subject: [PATCH] Implement TPCH Query 2 in TpchQueryBuilder (#9825)

Summary:
This PR introduces TPC-H Query 2 into the TpchQueryBuilder and extends the TpchBenchmark and ParquetTpchTest to include this query. Additionally, it provides a detailed performance comparison with DuckDB using the Parquet file format and includes the output of PrintPlanWithStats for detailed analysis.
Scaling Factor used is 1.

Here is the link to the PowerPoint presentation, which contains a detailed description for each driver and thread: https://ibm.box.com/s/sau464qdfac45aainwpj6pyvkvbtlsat

### Performance Comparison

Chip: Apple M1 Pro
Total Number of Cores: 10 (8 performance and 2 efficiency)
Memory: 32 GB

The following table summarizes the performance comparison between Velox and DuckDB (with Parquet file format) across various numbers of threads/drivers:

| # Num Threads/ Drivers | Velox(ms) | DuckDB(ms) |
|:----------------------:|:---------:|:----------:|
| 1 | 27 | 88.4 |
| 4 | 23 | 84.1 |
| 8 | 25 | 82.8 |
| 16 | 30 | 84 |

Pull Request resolved: https://github.com/facebookincubator/velox/pull/9825

Reviewed By: bikramSingh91

Differential Revision: D58244304

Pulled By: kevinwilfong

fbshipit-source-id: 4d216b10b49847ab692a394783bd5c59a59c9eb2
---
Notes on the change (this section is not part of the commit message):

  getQ2Plan() assembles the Q2 plan from two join pipelines that share a
  single PlanNodeIdGenerator.

  * Sub-query pipeline: scans partsupp and hash-joins it (ps_suppkey =
    s_suppkey) with supplier, which is itself joined to nation and to region
    scanned with the filter r_name = 'EUROPE'. The result goes through
    partialAggregation / localPartition / finalAggregation keyed on
    ps_partkey, producing min(ps_supplycost) AS min_supplycost.

  * Outer pipeline: scans partsupp and hash-joins it with part (scan is
    given "p_type like '%BRASS'" after an empty filter list, followed by
    filter("p_size = 15")), then with supplier joined to nation and to
    region using the same r_name = 'EUROPE' filter.

  * The two pipelines meet in a hash join on ps_partkey = p_partkey with the
    join filter "ps_supplycost = min_supplycost". The output is ordered by
    s_acctbal DESC, n_name, s_name, p_partkey, projected to eight columns
    and passed to limit(0, 100, false).

  * Each of the nine table scans captures its plan node id, and every
    captured id is mapped to getTableFilePaths(<table>) in
    TpchPlan::dataFiles before the plan is returned.

 velox/benchmarks/tpch/TpchBenchmark.cpp      |   5 +
 velox/dwio/parquet/tests/ParquetTpchTest.cpp |   5 +
 velox/exec/tests/utils/TpchQueryBuilder.cpp  | 203 +++++++++++++++++++
 velox/exec/tests/utils/TpchQueryBuilder.h    |   1 +
 4 files changed, 214 insertions(+)

diff --git a/velox/benchmarks/tpch/TpchBenchmark.cpp b/velox/benchmarks/tpch/TpchBenchmark.cpp
index fb1862ab15f3..2405714ea906 100644
--- a/velox/benchmarks/tpch/TpchBenchmark.cpp
+++ b/velox/benchmarks/tpch/TpchBenchmark.cpp
@@ -494,6 +494,11 @@ BENCHMARK(q1) {
   benchmark.run(planContext);
 }
 
+BENCHMARK(q2) {
+  const auto planContext = queryBuilder->getQueryPlan(2);
+  benchmark.run(planContext);
+}
+
 BENCHMARK(q3) {
   const auto planContext = queryBuilder->getQueryPlan(3);
   benchmark.run(planContext);
diff --git a/velox/dwio/parquet/tests/ParquetTpchTest.cpp b/velox/dwio/parquet/tests/ParquetTpchTest.cpp
index 47e406235ebe..b2a32db701ca 100644
--- a/velox/dwio/parquet/tests/ParquetTpchTest.cpp
+++ b/velox/dwio/parquet/tests/ParquetTpchTest.cpp
@@ -161,6 +161,11 @@ TEST_F(ParquetTpchTest, Q1) {
   assertQuery(1);
 }
 
+TEST_F(ParquetTpchTest, Q2) {
+  std::vector<uint32_t> sortingKeys{0, 1, 2, 3};
+  assertQuery(2, std::move(sortingKeys));
+}
+
 TEST_F(ParquetTpchTest, Q3) {
   std::vector<uint32_t> sortingKeys{1, 2};
   assertQuery(3, std::move(sortingKeys));
diff --git a/velox/exec/tests/utils/TpchQueryBuilder.cpp b/velox/exec/tests/utils/TpchQueryBuilder.cpp
index e52e4f3b66f7..3c2d6de8fe64 100644
--- a/velox/exec/tests/utils/TpchQueryBuilder.cpp
+++ b/velox/exec/tests/utils/TpchQueryBuilder.cpp
@@ -142,6 +142,8 @@ TpchPlan TpchQueryBuilder::getQueryPlan(int queryId) const {
   switch (queryId) {
     case 1:
       return getQ1Plan();
+    case 2:
+      return getQ2Plan();
     case 3:
       return getQ3Plan();
     case 5:
@@ -238,6 +240,207 @@ TpchPlan TpchQueryBuilder::getQ1Plan() const {
   return context;
 }
 
+TpchPlan TpchQueryBuilder::getQ2Plan() const {
+  std::vector<std::string> supplierColumnsSubQuery = {
+      "s_suppkey", "s_nationkey"};
+  std::vector<std::string> nationColumnsSubQuery = {
+      "n_nationkey", "n_regionkey"};
+  std::vector<std::string> supplierColumns = {
+      "s_acctbal",
+      "s_name",
+      "s_address",
+      "s_phone",
+      "s_comment",
+      "s_suppkey",
+      "s_nationkey"};
+  std::vector<std::string> partColumns = {
+      "p_partkey", "p_mfgr", "p_size", "p_type"};
+  std::vector<std::string> partsuppColumns = {
+      "ps_partkey", "ps_suppkey", "ps_supplycost"};
+  std::vector<std::string> nationColumns = {
+      "n_nationkey", "n_name", "n_regionkey"};
+  std::vector<std::string> regionColumns = {"r_regionkey", "r_name"};
+
+  auto supplierSelectedRowTypeSubQuery =
+      getRowType(kSupplier, supplierColumnsSubQuery);
+  const auto& supplierFileColumnsSubQuery = getFileColumnNames(kSupplier);
+  auto nationSelectedRowTypeSubQuery =
+      getRowType(kNation, nationColumnsSubQuery);
+  const auto& nationFileColumnsSubQuery = getFileColumnNames(kNation);
+  auto partSelectedRowType = getRowType(kPart, partColumns);
+  const auto& partFileColumns = getFileColumnNames(kPart);
+  auto supplierSelectedRowType = getRowType(kSupplier, supplierColumns);
+  const auto& supplierFileColumns = getFileColumnNames(kSupplier);
+  auto partsuppSelectedRowType = getRowType(kPartsupp, partsuppColumns);
+  const auto& partsuppFileColumns = getFileColumnNames(kPartsupp);
+  auto nationSelectedRowType = getRowType(kNation, nationColumns);
+  const auto& nationFileColumns = getFileColumnNames(kNation);
+  auto regionSelectedRowType = getRowType(kRegion, regionColumns);
+  const auto& regionFileColumns = getFileColumnNames(kRegion);
+
+  const std::string regionNameFilter = "r_name = 'EUROPE'";
+
+  auto planNodeIdGenerator = std::make_shared<core::PlanNodeIdGenerator>();
+  core::PlanNodeId supplierScanIdSubQuery;
+  core::PlanNodeId partsuppScanIdSubQuery;
+  core::PlanNodeId nationScanIdSubQuery;
+  core::PlanNodeId regionScanIdSubQuery;
+  core::PlanNodeId partScanId;
+  core::PlanNodeId supplierScanId;
+  core::PlanNodeId partsuppScanId;
+  core::PlanNodeId nationScanId;
+  core::PlanNodeId regionScanId;
+
+  auto regionSubQuery = PlanBuilder(planNodeIdGenerator)
+                            .tableScan(
+                                kRegion,
+                                regionSelectedRowType,
+                                regionFileColumns,
+                                {regionNameFilter})
+                            .capturePlanNodeId(regionScanIdSubQuery)
+                            .planNode();
+
+  auto nationJoinRegionSubQuery =
+      PlanBuilder(planNodeIdGenerator)
+          .tableScan(
+              kNation, nationSelectedRowTypeSubQuery, nationFileColumnsSubQuery)
+          .capturePlanNodeId(nationScanIdSubQuery)
+          .hashJoin(
+              {"n_regionkey"},
+              {"r_regionkey"},
+              regionSubQuery,
+              "",
+              {"n_nationkey"})
+          .planNode();
+
+  auto supplierJoinNationJoinRegionSubQuery =
+      PlanBuilder(planNodeIdGenerator)
+          .tableScan(
+              kSupplier,
+              supplierSelectedRowTypeSubQuery,
+              supplierFileColumnsSubQuery)
+          .capturePlanNodeId(supplierScanIdSubQuery)
+          .hashJoin(
+              {"s_nationkey"},
+              {"n_nationkey"},
+              nationJoinRegionSubQuery,
+              "",
+              {"s_suppkey"})
+          .planNode();
+
+  auto part = PlanBuilder(planNodeIdGenerator)
+                  .tableScan(
+                      kPart,
+                      partSelectedRowType,
+                      partFileColumns,
+                      {},
+                      "p_type like '%BRASS'")
+                  .capturePlanNodeId(partScanId)
+                  .filter("p_size = 15")
+                  .planNode();
+
+  auto region = PlanBuilder(planNodeIdGenerator)
+                    .tableScan(
+                        kRegion,
+                        regionSelectedRowType,
+                        regionFileColumns,
+                        {regionNameFilter})
+                    .capturePlanNodeId(regionScanId)
+                    .planNode();
+
+  auto nationJoinRegion =
+      PlanBuilder(planNodeIdGenerator)
+          .tableScan(kNation, nationSelectedRowType, nationFileColumns)
+          .capturePlanNodeId(nationScanId)
+          .hashJoin(
+              {"n_regionkey"},
+              {"r_regionkey"},
+              region,
+              "",
+              {"n_nationkey", "n_name"})
+          .planNode();
+
+  auto supplierJoinNationJoinRegion =
+      PlanBuilder(planNodeIdGenerator)
+          .tableScan(kSupplier, supplierSelectedRowType, supplierFileColumns)
+          .capturePlanNodeId(supplierScanId)
+          .hashJoin(
+              {"s_nationkey"},
+              {"n_nationkey"},
+              nationJoinRegion,
+              "",
+              mergeColumnNames(supplierColumns, {"s_suppkey", "n_name"}))
+          .planNode();
+
+  auto partsuppJoinPartJoinSupplierJoinNationJoinRegion =
+      PlanBuilder(planNodeIdGenerator)
+          .tableScan(kPartsupp, partsuppSelectedRowType, partsuppFileColumns)
+          .capturePlanNodeId(partsuppScanId)
+          .hashJoin(
+              {"ps_partkey"},
+              {"p_partkey"},
+              part,
+              "",
+              {"ps_suppkey", "ps_supplycost", "p_partkey", "p_mfgr"})
+          .hashJoin(
+              {"ps_suppkey"},
+              {"s_suppkey"},
+              supplierJoinNationJoinRegion,
+              "",
+              mergeColumnNames(
+                  supplierColumns,
+                  {"ps_supplycost", "p_partkey", "p_mfgr", "n_name"}))
+          .planNode();
+
+  auto plan =
+      PlanBuilder(planNodeIdGenerator)
+          .tableScan(kPartsupp, partsuppSelectedRowType, partsuppFileColumns)
+          .capturePlanNodeId(partsuppScanIdSubQuery)
+          .hashJoin(
+              {"ps_suppkey"},
+              {"s_suppkey"},
+              supplierJoinNationJoinRegionSubQuery,
+              "",
+              {"ps_supplycost", "ps_partkey"})
+          .partialAggregation(
+              {"ps_partkey"}, {"min(ps_supplycost) AS min_supplycost"})
+          .localPartition({"ps_partkey"})
+          .finalAggregation()
+          .hashJoin(
+              {"ps_partkey"},
+              {"p_partkey"},
+              partsuppJoinPartJoinSupplierJoinNationJoinRegion,
+              "ps_supplycost = min_supplycost",
+              mergeColumnNames(
+                  supplierColumns, {"p_partkey", "p_mfgr", "n_name"}))
+          .orderBy({"s_acctbal DESC", "n_name", "s_name", "p_partkey"}, false)
+          .project(
+              {"s_acctbal",
+               "s_name",
+               "n_name",
+               "p_partkey",
+               "p_mfgr",
+               "s_address",
+               "s_phone",
+               "s_comment"})
+          .limit(0, 100, false)
+          .planNode();
+
+  TpchPlan context;
+  context.plan = std::move(plan);
+  context.dataFiles[supplierScanIdSubQuery] = getTableFilePaths(kSupplier);
+  context.dataFiles[partsuppScanIdSubQuery] = getTableFilePaths(kPartsupp);
+  context.dataFiles[nationScanIdSubQuery] = getTableFilePaths(kNation);
+  context.dataFiles[regionScanIdSubQuery] = getTableFilePaths(kRegion);
+  context.dataFiles[partScanId] = getTableFilePaths(kPart);
+  context.dataFiles[supplierScanId] = getTableFilePaths(kSupplier);
+  context.dataFiles[partsuppScanId] = getTableFilePaths(kPartsupp);
+  context.dataFiles[nationScanId] = getTableFilePaths(kNation);
+  context.dataFiles[regionScanId] = getTableFilePaths(kRegion);
+  context.dataFileFormat = format_;
+  return context;
+}
+
 TpchPlan TpchQueryBuilder::getQ3Plan() const {
   std::vector<std::string> lineitemColumns = {
       "l_shipdate", "l_orderkey", "l_extendedprice", "l_discount"};
diff --git a/velox/exec/tests/utils/TpchQueryBuilder.h b/velox/exec/tests/utils/TpchQueryBuilder.h
index 4f4227247ff0..c1ad1a970031 100644
--- a/velox/exec/tests/utils/TpchQueryBuilder.h
+++ b/velox/exec/tests/utils/TpchQueryBuilder.h
@@ -91,6 +91,7 @@ class TpchQueryBuilder {
       const std::vector<std::string>& columns);
 
   TpchPlan getQ1Plan() const;
+  TpchPlan getQ2Plan() const;
   TpchPlan getQ3Plan() const;
   TpchPlan getQ5Plan() const;
   TpchPlan getQ6Plan() const;