From 77f330c6a2b26f2d1d4d4bf11d456fad466316b4 Mon Sep 17 00:00:00 2001
From: Austin Liu
Date: Fri, 4 Oct 2024 03:47:46 +0800
Subject: [PATCH 01/22] Add IMDB(JOB) Benchmark [2/N] (imdb queries) (#12529)

* imdb dataset

* cargo fmt

* Add 113 queries for IMDB(JOB)

Signed-off-by: Austin Liu

* Add `get_query_sql` from `query_id` string

Signed-off-by: Austin Liu

* Fix CSV reader & Remove Parquet partition

Signed-off-by: Austin Liu

* Add benchmark IMDB runner

Signed-off-by: Austin Liu

* Add `run_imdb` script

Signed-off-by: Austin Liu

* Add checker for imdb option

Signed-off-by: Austin Liu

* Add SLT for IMDB

Signed-off-by: Austin Liu

* Fix `get_query_sql()` for CI roundtrip test

Signed-off-by: Austin Liu

Fix `get_query_sql()` for CI roundtrip test

Signed-off-by: Austin Liu

Fix `get_query_sql()` for CI roundtrip test

Signed-off-by: Austin Liu

* Clean up

Signed-off-by: Austin Liu

* Add missing license

Signed-off-by: Austin Liu

* Add IMDB(JOB) queries `2b` to `5c`

Signed-off-by: Austin Liu

* Add `INCLUDE_IMDB` in CI verify-benchmark-results

Signed-off-by: Austin Liu

* Prepare IMDB dataset

Signed-off-by: Austin Liu

Prepare IMDB dataset

Signed-off-by: Austin Liu

* use uint as id type

* format

* Separate `tpch` and `imdb` benchmarking CI jobs

Signed-off-by: Austin Liu

Fix path

Signed-off-by: Austin Liu

Fix path

Signed-off-by: Austin Liu

Remove `tpch` in `imdb` benchmark

Signed-off-by: Austin Liu

* Remove IMDB(JOB) slt in CI

Signed-off-by: Austin Liu

Remove IMDB(JOB) slt in CI

Signed-off-by: Austin Liu

---------

Signed-off-by: Austin Liu
Co-authored-by: DouPache
---
 benchmarks/bench.sh | 14 +
 benchmarks/queries/imdb/10a.sql | 1 +
 benchmarks/queries/imdb/10b.sql | 1 +
 benchmarks/queries/imdb/10c.sql | 1 +
 benchmarks/queries/imdb/11a.sql | 1 +
 benchmarks/queries/imdb/11b.sql | 1 +
 benchmarks/queries/imdb/11c.sql | 1 +
 benchmarks/queries/imdb/11d.sql | 1 +
 benchmarks/queries/imdb/12a.sql | 1 +
 benchmarks/queries/imdb/12b.sql | 1 +
 benchmarks/queries/imdb/12c.sql | 1 +
 benchmarks/queries/imdb/13a.sql | 1 +
 benchmarks/queries/imdb/13b.sql | 1 +
 benchmarks/queries/imdb/13c.sql | 1 +
 benchmarks/queries/imdb/13d.sql | 1 +
 benchmarks/queries/imdb/14a.sql | 1 +
 benchmarks/queries/imdb/14b.sql | 1 +
 benchmarks/queries/imdb/14c.sql | 1 +
 benchmarks/queries/imdb/15a.sql | 1 +
 benchmarks/queries/imdb/15b.sql | 1 +
 benchmarks/queries/imdb/15c.sql | 1 +
 benchmarks/queries/imdb/15d.sql | 1 +
 benchmarks/queries/imdb/16a.sql | 1 +
 benchmarks/queries/imdb/16b.sql | 1 +
 benchmarks/queries/imdb/16c.sql | 1 +
 benchmarks/queries/imdb/16d.sql | 1 +
 benchmarks/queries/imdb/17a.sql | 1 +
 benchmarks/queries/imdb/17b.sql | 1 +
 benchmarks/queries/imdb/17c.sql | 1 +
 benchmarks/queries/imdb/17d.sql | 1 +
 benchmarks/queries/imdb/17e.sql | 1 +
 benchmarks/queries/imdb/17f.sql | 1 +
 benchmarks/queries/imdb/18a.sql | 1 +
 benchmarks/queries/imdb/18b.sql | 1 +
 benchmarks/queries/imdb/18c.sql | 1 +
 benchmarks/queries/imdb/19a.sql | 1 +
 benchmarks/queries/imdb/19b.sql | 1 +
 benchmarks/queries/imdb/19c.sql | 1 +
 benchmarks/queries/imdb/19d.sql | 1 +
 benchmarks/queries/imdb/1a.sql | 1 +
 benchmarks/queries/imdb/1b.sql | 1 +
 benchmarks/queries/imdb/1c.sql | 1 +
 benchmarks/queries/imdb/1d.sql | 1 +
 benchmarks/queries/imdb/20a.sql | 1 +
 benchmarks/queries/imdb/20b.sql | 1 +
 benchmarks/queries/imdb/20c.sql | 1 +
 benchmarks/queries/imdb/21a.sql | 1 +
 benchmarks/queries/imdb/21b.sql | 1 +
 benchmarks/queries/imdb/21c.sql | 1 +
 benchmarks/queries/imdb/22a.sql | 1 +
 benchmarks/queries/imdb/22b.sql | 1 +
 benchmarks/queries/imdb/22c.sql | 1 +
 benchmarks/queries/imdb/22d.sql | 1 +
 benchmarks/queries/imdb/23a.sql | 1 +
 benchmarks/queries/imdb/23b.sql | 1 +
 benchmarks/queries/imdb/23c.sql | 1 +
 benchmarks/queries/imdb/24a.sql | 1 +
 benchmarks/queries/imdb/24b.sql | 1 +
 benchmarks/queries/imdb/25a.sql | 1 +
 benchmarks/queries/imdb/25b.sql | 1 +
 benchmarks/queries/imdb/25c.sql | 1 +
 benchmarks/queries/imdb/26a.sql | 1 +
 benchmarks/queries/imdb/26b.sql | 1 +
 benchmarks/queries/imdb/26c.sql | 1 +
 benchmarks/queries/imdb/27a.sql | 1 +
 benchmarks/queries/imdb/27b.sql | 1 +
 benchmarks/queries/imdb/27c.sql | 1 +
 benchmarks/queries/imdb/28a.sql | 1 +
 benchmarks/queries/imdb/28b.sql | 1 +
 benchmarks/queries/imdb/28c.sql | 1 +
 benchmarks/queries/imdb/29a.sql | 1 +
 benchmarks/queries/imdb/29b.sql | 1 +
 benchmarks/queries/imdb/29c.sql | 1 +
 benchmarks/queries/imdb/2a.sql | 1 +
 benchmarks/queries/imdb/2b.sql | 1 +
 benchmarks/queries/imdb/2c.sql | 1 +
 benchmarks/queries/imdb/2d.sql | 1 +
 benchmarks/queries/imdb/30a.sql | 1 +
 benchmarks/queries/imdb/30b.sql | 1 +
 benchmarks/queries/imdb/30c.sql | 1 +
 benchmarks/queries/imdb/31a.sql | 1 +
 benchmarks/queries/imdb/31b.sql | 1 +
 benchmarks/queries/imdb/31c.sql | 1 +
 benchmarks/queries/imdb/32a.sql | 1 +
 benchmarks/queries/imdb/32b.sql | 1 +
 benchmarks/queries/imdb/33a.sql | 1 +
 benchmarks/queries/imdb/33b.sql | 1 +
 benchmarks/queries/imdb/33c.sql | 1 +
 benchmarks/queries/imdb/3a.sql | 1 +
 benchmarks/queries/imdb/3b.sql | 1 +
 benchmarks/queries/imdb/3c.sql | 1 +
 benchmarks/queries/imdb/4a.sql | 1 +
 benchmarks/queries/imdb/4b.sql | 1 +
 benchmarks/queries/imdb/4c.sql | 1 +
 benchmarks/queries/imdb/5a.sql | 1 +
 benchmarks/queries/imdb/5b.sql | 1 +
 benchmarks/queries/imdb/5c.sql | 1 +
 benchmarks/queries/imdb/6a.sql | 1 +
 benchmarks/queries/imdb/6b.sql | 1 +
 benchmarks/queries/imdb/6c.sql | 1 +
 benchmarks/queries/imdb/6d.sql | 1 +
 benchmarks/queries/imdb/6e.sql | 1 +
 benchmarks/queries/imdb/6f.sql | 1 +
 benchmarks/queries/imdb/7a.sql | 1 +
 benchmarks/queries/imdb/7b.sql | 1 +
 benchmarks/queries/imdb/7c.sql | 1 +
 benchmarks/queries/imdb/8a.sql | 1 +
 benchmarks/queries/imdb/8b.sql | 1 +
 benchmarks/queries/imdb/8c.sql | 1 +
 benchmarks/queries/imdb/8d.sql | 1 +
 benchmarks/queries/imdb/9a.sql | 1 +
 benchmarks/queries/imdb/9b.sql | 1 +
 benchmarks/queries/imdb/9c.sql | 1 +
 benchmarks/queries/imdb/9d.sql | 1 +
 benchmarks/src/bin/dfbench.rs | 4 +-
 benchmarks/src/bin/imdb.rs | 11 +
 benchmarks/src/imdb/convert.rs | 6 +-
 benchmarks/src/imdb/mod.rs | 75 ++-
 benchmarks/src/imdb/run.rs | 827 ++++++++++++++++++++++++++++++++
 119 files changed, 1023 insertions(+), 27 deletions(-)
 create mode 100644 benchmarks/queries/imdb/10a.sql
 create mode 100644 benchmarks/queries/imdb/10b.sql
 create mode 100644 benchmarks/queries/imdb/10c.sql
 create mode 100644 benchmarks/queries/imdb/11a.sql
 create mode 100644 benchmarks/queries/imdb/11b.sql
 create mode 100644 benchmarks/queries/imdb/11c.sql
 create mode 100644 benchmarks/queries/imdb/11d.sql
 create mode 100644 benchmarks/queries/imdb/12a.sql
 create mode 100644 benchmarks/queries/imdb/12b.sql
 create mode 100644 benchmarks/queries/imdb/12c.sql
 create mode 100644 benchmarks/queries/imdb/13a.sql
 create mode 100644 benchmarks/queries/imdb/13b.sql
 create mode 100644 benchmarks/queries/imdb/13c.sql
 create mode 100644 benchmarks/queries/imdb/13d.sql
 create mode 100644 benchmarks/queries/imdb/14a.sql
 create mode 100644 benchmarks/queries/imdb/14b.sql
 create mode 100644 benchmarks/queries/imdb/14c.sql
 create mode 100644 benchmarks/queries/imdb/15a.sql
 create mode 100644 benchmarks/queries/imdb/15b.sql
 create mode 100644 benchmarks/queries/imdb/15c.sql
 create mode 100644 benchmarks/queries/imdb/15d.sql
 create mode 100644 benchmarks/queries/imdb/16a.sql
 create mode 100644 benchmarks/queries/imdb/16b.sql
 create mode 100644 benchmarks/queries/imdb/16c.sql
 create mode 100644 benchmarks/queries/imdb/16d.sql
 create mode 100644 benchmarks/queries/imdb/17a.sql
 create mode 100644 benchmarks/queries/imdb/17b.sql
 create mode 100644 benchmarks/queries/imdb/17c.sql
 create mode 100644 benchmarks/queries/imdb/17d.sql
 create mode 100644 benchmarks/queries/imdb/17e.sql
 create mode 100644 benchmarks/queries/imdb/17f.sql
 create mode 100644 benchmarks/queries/imdb/18a.sql
 create mode 100644 benchmarks/queries/imdb/18b.sql
 create mode 100644 benchmarks/queries/imdb/18c.sql
 create mode 100644 benchmarks/queries/imdb/19a.sql
 create mode 100644 benchmarks/queries/imdb/19b.sql
 create mode 100644 benchmarks/queries/imdb/19c.sql
 create mode 100644 benchmarks/queries/imdb/19d.sql
 create mode 100644 benchmarks/queries/imdb/1a.sql
 create mode 100644 benchmarks/queries/imdb/1b.sql
 create mode 100644 benchmarks/queries/imdb/1c.sql
 create mode 100644 benchmarks/queries/imdb/1d.sql
 create mode 100644 benchmarks/queries/imdb/20a.sql
 create mode 100644 benchmarks/queries/imdb/20b.sql
 create mode 100644 benchmarks/queries/imdb/20c.sql
 create mode 100644 benchmarks/queries/imdb/21a.sql
 create mode 100644 benchmarks/queries/imdb/21b.sql
 create mode 100644 benchmarks/queries/imdb/21c.sql
 create mode 100644 benchmarks/queries/imdb/22a.sql
 create mode 100644 benchmarks/queries/imdb/22b.sql
 create mode 100644 benchmarks/queries/imdb/22c.sql
 create mode 100644 benchmarks/queries/imdb/22d.sql
 create mode 100644 benchmarks/queries/imdb/23a.sql
 create mode 100644 benchmarks/queries/imdb/23b.sql
 create mode 100644 benchmarks/queries/imdb/23c.sql
 create mode 100644 benchmarks/queries/imdb/24a.sql
 create mode 100644 benchmarks/queries/imdb/24b.sql
 create mode 100644 benchmarks/queries/imdb/25a.sql
 create mode 100644 benchmarks/queries/imdb/25b.sql
 create mode 100644 benchmarks/queries/imdb/25c.sql
 create mode 100644 benchmarks/queries/imdb/26a.sql
 create mode 100644 benchmarks/queries/imdb/26b.sql
 create mode 100644 benchmarks/queries/imdb/26c.sql
 create mode 100644 benchmarks/queries/imdb/27a.sql
 create mode 100644 benchmarks/queries/imdb/27b.sql
 create mode 100644 benchmarks/queries/imdb/27c.sql
 create mode 100644 benchmarks/queries/imdb/28a.sql
 create mode 100644 benchmarks/queries/imdb/28b.sql
 create mode 100644 benchmarks/queries/imdb/28c.sql
 create mode 100644 benchmarks/queries/imdb/29a.sql
 create mode 100644 benchmarks/queries/imdb/29b.sql
 create mode 100644 benchmarks/queries/imdb/29c.sql
 create mode 100644 benchmarks/queries/imdb/2a.sql
 create mode 100644 benchmarks/queries/imdb/2b.sql
 create mode 100644 benchmarks/queries/imdb/2c.sql
 create mode 100644 benchmarks/queries/imdb/2d.sql
 create mode 100644 benchmarks/queries/imdb/30a.sql
 create mode 100644 benchmarks/queries/imdb/30b.sql
 create mode 100644 benchmarks/queries/imdb/30c.sql
 create mode 100644 benchmarks/queries/imdb/31a.sql
 create mode 100644 benchmarks/queries/imdb/31b.sql
 create mode 100644 benchmarks/queries/imdb/31c.sql
 create mode 100644 benchmarks/queries/imdb/32a.sql
 create mode 100644 benchmarks/queries/imdb/32b.sql
 create mode 100644 benchmarks/queries/imdb/33a.sql
 create mode 100644 benchmarks/queries/imdb/33b.sql
 create mode 100644 benchmarks/queries/imdb/33c.sql
 create mode 100644 benchmarks/queries/imdb/3a.sql
 create mode 100644 benchmarks/queries/imdb/3b.sql
 create mode 100644 benchmarks/queries/imdb/3c.sql
 create mode 100644 benchmarks/queries/imdb/4a.sql
 create mode 100644 benchmarks/queries/imdb/4b.sql
 create mode 100644 benchmarks/queries/imdb/4c.sql
 create mode 100644 benchmarks/queries/imdb/5a.sql
 create mode 100644 benchmarks/queries/imdb/5b.sql
 create mode 100644 benchmarks/queries/imdb/5c.sql
 create mode 100644 benchmarks/queries/imdb/6a.sql
 create mode 100644 benchmarks/queries/imdb/6b.sql
 create mode 100644 benchmarks/queries/imdb/6c.sql
 create mode 100644 benchmarks/queries/imdb/6d.sql
 create mode 100644 benchmarks/queries/imdb/6e.sql
 create mode 100644 benchmarks/queries/imdb/6f.sql
 create mode 100644 benchmarks/queries/imdb/7a.sql
 create mode 100644 benchmarks/queries/imdb/7b.sql
 create mode 100644 benchmarks/queries/imdb/7c.sql
 create mode 100644 benchmarks/queries/imdb/8a.sql
 create mode 100644 benchmarks/queries/imdb/8b.sql
 create mode 100644 benchmarks/queries/imdb/8c.sql
 create mode 100644 benchmarks/queries/imdb/8d.sql
 create mode 100644 benchmarks/queries/imdb/9a.sql
 create mode 100644 benchmarks/queries/imdb/9b.sql
 create mode 100644 benchmarks/queries/imdb/9c.sql
 create mode 100644 benchmarks/queries/imdb/9d.sql
 create mode 100644 benchmarks/src/imdb/run.rs

diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
index 24efab6c6ca5..70faa9ef2b73 100755
--- a/benchmarks/bench.sh
+++ b/benchmarks/bench.sh
@@ -211,6 +211,7 @@ main() {
             run_clickbench_1
             run_clickbench_partitioned
             run_clickbench_extended
+            run_imdb
             ;;
         tpch)
             run_tpch "1"
@@ -239,6 +240,9 @@ main() {
         clickbench_extended)
             run_clickbench_extended
             ;;
+        imdb)
+            run_imdb
+            ;;
         *)
             echo "Error: unknown benchmark '$BENCHMARK' for run"
             usage
@@ -510,6 +514,16 @@ data_imdb() {
     fi
 }

+# Runs the imdb benchmark
+run_imdb() {
+    IMDB_DIR="${DATA_DIR}/imdb"
+
+    RESULTS_FILE="${RESULTS_DIR}/imdb.json"
+    echo "RESULTS_FILE: ${RESULTS_FILE}"
+    echo "Running imdb benchmark..."
+    $CARGO_COMMAND --bin imdb -- benchmark datafusion --iterations 5 --path "${IMDB_DIR}" --prefer_hash_join "${PREFER_HASH_JOIN}" --format parquet -o "${RESULTS_FILE}"
+}
+
diff --git a/benchmarks/queries/imdb/10a.sql b/benchmarks/queries/imdb/10a.sql
new file mode 100644
index 000000000000..95b049b77479
--- /dev/null
+++ b/benchmarks/queries/imdb/10a.sql
@@ -0,0 +1 @@
+SELECT MIN(chn.name) AS uncredited_voiced_character, MIN(t.title) AS russian_movie FROM char_name AS chn, cast_info AS ci, company_name AS cn, company_type AS ct, movie_companies AS mc, role_type AS rt, title AS t WHERE ci.note like '%(voice)%' and ci.note like '%(uncredited)%' AND cn.country_code = '[ru]' AND rt.role = 'actor' AND t.production_year > 2005 AND t.id = mc.movie_id AND t.id = ci.movie_id AND ci.movie_id = mc.movie_id AND chn.id = ci.person_role_id AND rt.id = ci.role_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
diff --git a/benchmarks/queries/imdb/10b.sql b/benchmarks/queries/imdb/10b.sql
new file mode 100644
index 000000000000..c32153631412
--- /dev/null
+++ b/benchmarks/queries/imdb/10b.sql
@@ -0,0 +1 @@
+SELECT MIN(chn.name) AS character, MIN(t.title) AS russian_mov_with_actor_producer FROM char_name AS chn, cast_info AS ci, company_name AS cn, company_type AS ct, movie_companies AS mc, role_type AS rt, title AS t WHERE ci.note like '%(producer)%' AND cn.country_code = '[ru]' AND rt.role = 'actor' AND t.production_year > 2010 AND t.id = mc.movie_id AND t.id = ci.movie_id AND ci.movie_id = mc.movie_id AND chn.id = ci.person_role_id AND rt.id = ci.role_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
diff --git a/benchmarks/queries/imdb/10c.sql b/benchmarks/queries/imdb/10c.sql
new file mode 100644
index 000000000000..b862cf4fa7ac
--- /dev/null
+++ b/benchmarks/queries/imdb/10c.sql
@@ -0,0 +1 @@
+SELECT MIN(chn.name) AS character, MIN(t.title) AS movie_with_american_producer FROM char_name AS chn, cast_info AS ci, company_name AS cn, company_type AS ct, movie_companies AS mc, role_type AS rt, title AS t WHERE ci.note like '%(producer)%' AND cn.country_code = '[us]' AND t.production_year > 1990 AND t.id = mc.movie_id AND t.id = ci.movie_id AND ci.movie_id = mc.movie_id AND chn.id = ci.person_role_id AND rt.id = ci.role_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
diff --git a/benchmarks/queries/imdb/11a.sql b/benchmarks/queries/imdb/11a.sql
new file mode 100644
index 000000000000..f835968e900b
--- /dev/null
+++ b/benchmarks/queries/imdb/11a.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS from_company, MIN(lt.link) AS movie_link_type, MIN(t.title) AS non_polish_sequel_movie FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND t.production_year BETWEEN 1950 AND 2000 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id;
diff --git a/benchmarks/queries/imdb/11b.sql b/benchmarks/queries/imdb/11b.sql
new file mode 100644
index 000000000000..2411e19ea608
--- /dev/null
+++ b/benchmarks/queries/imdb/11b.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS from_company, MIN(lt.link) AS movie_link_type, MIN(t.title) AS sequel_movie FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follows%' AND mc.note IS NULL AND t.production_year = 1998 and t.title like '%Money%' AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id;
diff --git a/benchmarks/queries/imdb/11c.sql b/benchmarks/queries/imdb/11c.sql
new file mode 100644
index 000000000000..3bf794678918
--- /dev/null
+++ b/benchmarks/queries/imdb/11c.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS from_company, MIN(mc.note) AS production_note, MIN(t.title) AS movie_based_on_book FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' and (cn.name like '20th Century Fox%' or cn.name like 'Twentieth Century Fox%') AND ct.kind != 'production companies' and ct.kind is not NULL AND k.keyword in ('sequel', 'revenge', 'based-on-novel') AND mc.note is not NULL AND t.production_year > 1950 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id;
diff --git a/benchmarks/queries/imdb/11d.sql b/benchmarks/queries/imdb/11d.sql
new file mode 100644
index 000000000000..0bc33e1d6e88
--- /dev/null
+++ b/benchmarks/queries/imdb/11d.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS from_company, MIN(mc.note) AS production_note, MIN(t.title) AS movie_based_on_book FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND ct.kind != 'production companies' and ct.kind is not NULL AND k.keyword in ('sequel', 'revenge', 'based-on-novel') AND mc.note is not NULL AND t.production_year > 1950 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id;
diff --git a/benchmarks/queries/imdb/12a.sql b/benchmarks/queries/imdb/12a.sql
new file mode 100644
index 000000000000..22add74bd55d
--- /dev/null
+++ b/benchmarks/queries/imdb/12a.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS drama_horror_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, title AS t WHERE cn.country_code = '[us]' AND ct.kind = 'production companies' AND it1.info = 'genres' AND it2.info = 'rating' AND mi.info in ('Drama', 'Horror') AND mi_idx.info > '8.0' AND t.production_year between 2005 and 2008 AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND mi.info_type_id = it1.id AND mi_idx.info_type_id = it2.id AND t.id = mc.movie_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id;
diff --git a/benchmarks/queries/imdb/12b.sql b/benchmarks/queries/imdb/12b.sql
new file mode 100644
index 000000000000..fc30ad550d10
--- /dev/null
+++ b/benchmarks/queries/imdb/12b.sql
@@ -0,0 +1 @@
+SELECT MIN(mi.info) AS budget, MIN(t.title) AS unsuccsessful_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, title AS t WHERE cn.country_code ='[us]' AND ct.kind is not NULL and (ct.kind ='production companies' or ct.kind = 'distributors') AND it1.info ='budget' AND it2.info ='bottom 10 rank' AND t.production_year >2000 AND (t.title LIKE 'Birdemic%' OR t.title LIKE '%Movie%') AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND mi.info_type_id = it1.id AND mi_idx.info_type_id = it2.id AND t.id = mc.movie_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id;
diff --git a/benchmarks/queries/imdb/12c.sql b/benchmarks/queries/imdb/12c.sql
new file mode 100644
index 000000000000..64a340b2381e
--- /dev/null
+++ b/benchmarks/queries/imdb/12c.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS mainstream_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, title AS t WHERE cn.country_code = '[us]' AND ct.kind = 'production companies' AND it1.info = 'genres' AND it2.info = 'rating' AND mi.info in ('Drama', 'Horror', 'Western', 'Family') AND mi_idx.info > '7.0' AND t.production_year between 2000 and 2010 AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND mi.info_type_id = it1.id AND mi_idx.info_type_id = it2.id AND t.id = mc.movie_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id;
diff --git a/benchmarks/queries/imdb/13a.sql b/benchmarks/queries/imdb/13a.sql
new file mode 100644
index 000000000000..95eb439d1e22
--- /dev/null
+++ b/benchmarks/queries/imdb/13a.sql
@@ -0,0 +1 @@
+SELECT MIN(mi.info) AS release_date, MIN(miidx.info) AS rating, MIN(t.title) AS german_movie FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[de]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND kt.kind ='movie' AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id;
diff --git a/benchmarks/queries/imdb/13b.sql b/benchmarks/queries/imdb/13b.sql
new file mode 100644
index 000000000000..4b6f75ab0ae6
--- /dev/null
+++ b/benchmarks/queries/imdb/13b.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS producing_company, MIN(miidx.info) AS rating, MIN(t.title) AS movie_about_winning FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[us]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND kt.kind ='movie' AND t.title != '' AND (t.title LIKE '%Champion%' OR t.title LIKE '%Loser%') AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id;
diff --git a/benchmarks/queries/imdb/13c.sql b/benchmarks/queries/imdb/13c.sql
new file mode 100644
index 000000000000..9e8c92327bd5
--- /dev/null
+++ b/benchmarks/queries/imdb/13c.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS producing_company, MIN(miidx.info) AS rating, MIN(t.title) AS movie_about_winning FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[us]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND kt.kind ='movie' AND t.title != '' AND (t.title LIKE 'Champion%' OR t.title LIKE 'Loser%') AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id;
diff --git a/benchmarks/queries/imdb/13d.sql b/benchmarks/queries/imdb/13d.sql
new file mode 100644
index 000000000000..a8bc567cabe1
--- /dev/null
+++ b/benchmarks/queries/imdb/13d.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS producing_company, MIN(miidx.info) AS rating, MIN(t.title) AS movie FROM company_name AS cn, company_type AS ct, info_type AS it, info_type AS it2, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS miidx, title AS t WHERE cn.country_code ='[us]' AND ct.kind ='production companies' AND it.info ='rating' AND it2.info ='release dates' AND kt.kind ='movie' AND mi.movie_id = t.id AND it2.id = mi.info_type_id AND kt.id = t.kind_id AND mc.movie_id = t.id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND miidx.movie_id = t.id AND it.id = miidx.info_type_id AND mi.movie_id = miidx.movie_id AND mi.movie_id = mc.movie_id AND miidx.movie_id = mc.movie_id;
diff --git a/benchmarks/queries/imdb/14a.sql b/benchmarks/queries/imdb/14a.sql
new file mode 100644
index 000000000000..af1a7c8983a6
--- /dev/null
+++ b/benchmarks/queries/imdb/14a.sql
@@ -0,0 +1 @@
+SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS northern_dark_movie FROM info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind = 'movie' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2010 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/14b.sql b/benchmarks/queries/imdb/14b.sql
new file mode 100644
index 000000000000..c606ebc73dd4
--- /dev/null
+++ b/benchmarks/queries/imdb/14b.sql
@@ -0,0 +1 @@
+SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS western_dark_production FROM info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title') AND kt.kind = 'movie' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info > '6.0' AND t.production_year > 2010 and (t.title like '%murder%' or t.title like '%Murder%' or t.title like '%Mord%') AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/14c.sql b/benchmarks/queries/imdb/14c.sql
new file mode 100644
index 000000000000..2a6dffde2639
--- /dev/null
+++ b/benchmarks/queries/imdb/14c.sql
@@ -0,0 +1 @@
+SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS north_european_dark_production FROM info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it1.info = 'countries' AND it2.info = 'rating' AND k.keyword is not null and k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/15a.sql b/benchmarks/queries/imdb/15a.sql
new file mode 100644
index 000000000000..1d052f004426
--- /dev/null
+++ b/benchmarks/queries/imdb/15a.sql
@@ -0,0 +1 @@
+SELECT MIN(mi.info) AS release_date, MIN(t.title) AS internet_movie FROM aka_title AS at, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cn.country_code = '[us]' AND it1.info = 'release dates' AND mc.note like '%(200%)%' and mc.note like '%(worldwide)%' AND mi.note like '%internet%' AND mi.info like 'USA:% 200%' AND t.production_year > 2000 AND t.id = at.movie_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = at.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = at.movie_id AND mc.movie_id = at.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
diff --git a/benchmarks/queries/imdb/15b.sql b/benchmarks/queries/imdb/15b.sql
new file mode 100644
index 000000000000..21c81358fa7a
--- /dev/null
+++ b/benchmarks/queries/imdb/15b.sql
@@ -0,0 +1 @@
+SELECT MIN(mi.info) AS release_date, MIN(t.title) AS youtube_movie FROM aka_title AS at, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cn.country_code = '[us]' and cn.name = 'YouTube' AND it1.info = 'release dates' AND mc.note like '%(200%)%' and mc.note like '%(worldwide)%' AND mi.note like '%internet%' AND mi.info like 'USA:% 200%' AND t.production_year between 2005 and 2010 AND t.id = at.movie_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = at.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = at.movie_id AND mc.movie_id = at.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
diff --git a/benchmarks/queries/imdb/15c.sql b/benchmarks/queries/imdb/15c.sql
new file mode 100644
index 000000000000..2d08c5203974
--- /dev/null
+++ b/benchmarks/queries/imdb/15c.sql
@@ -0,0 +1 @@
+SELECT MIN(mi.info) AS release_date, MIN(t.title) AS modern_american_internet_movie FROM aka_title AS at, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cn.country_code = '[us]' AND it1.info = 'release dates' AND mi.note like '%internet%' AND mi.info is not NULL and (mi.info like 'USA:% 199%' or mi.info like 'USA:% 200%') AND t.production_year > 1990 AND t.id = at.movie_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = at.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = at.movie_id AND mc.movie_id = at.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
diff --git a/benchmarks/queries/imdb/15d.sql b/benchmarks/queries/imdb/15d.sql
new file mode 100644
index 000000000000..040e9815d86c
--- /dev/null
+++ b/benchmarks/queries/imdb/15d.sql
@@ -0,0 +1 @@
+SELECT MIN(at.title) AS aka_title, MIN(t.title) AS internet_movie_title FROM aka_title AS at, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cn.country_code = '[us]' AND it1.info = 'release dates' AND mi.note like '%internet%' AND t.production_year > 1990 AND t.id = at.movie_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = at.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = at.movie_id AND mc.movie_id = at.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id;
diff --git a/benchmarks/queries/imdb/16a.sql b/benchmarks/queries/imdb/16a.sql
new file mode 100644
index 000000000000..aaa0020269d2
--- /dev/null
+++ b/benchmarks/queries/imdb/16a.sql
@@ -0,0 +1 @@
+SELECT MIN(an.name) AS cool_actor_pseudonym, MIN(t.title) AS series_named_after_char FROM aka_name AS an, cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND t.episode_nr >= 50 AND t.episode_nr < 100 AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/16b.sql b/benchmarks/queries/imdb/16b.sql
new file mode 100644
index 000000000000..c6c0bef319de
--- /dev/null
+++ b/benchmarks/queries/imdb/16b.sql
@@ -0,0 +1 @@
+SELECT MIN(an.name) AS cool_actor_pseudonym, MIN(t.title) AS series_named_after_char FROM aka_name AS an, cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/16c.sql b/benchmarks/queries/imdb/16c.sql
new file mode 100644
index 000000000000..5c3b35752195
--- /dev/null
+++ b/benchmarks/queries/imdb/16c.sql
@@ -0,0 +1 @@
+SELECT MIN(an.name) AS cool_actor_pseudonym, MIN(t.title) AS series_named_after_char FROM aka_name AS an, cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND t.episode_nr < 100 AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/16d.sql b/benchmarks/queries/imdb/16d.sql
new file mode 100644
index 000000000000..c9e1b5f25ce5
--- /dev/null
+++ b/benchmarks/queries/imdb/16d.sql
@@ -0,0 +1 @@
+SELECT MIN(an.name) AS cool_actor_pseudonym, MIN(t.title) AS series_named_after_char FROM aka_name AS an, cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND t.episode_nr >= 5 AND t.episode_nr < 100 AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/17a.sql b/benchmarks/queries/imdb/17a.sql
new file mode 100644
index 000000000000..e854a957e429
--- /dev/null
+++ b/benchmarks/queries/imdb/17a.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS member_in_charnamed_american_movie, MIN(n.name) AS a1 FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND n.name LIKE 'B%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/17b.sql b/benchmarks/queries/imdb/17b.sql
new file mode 100644
index 000000000000..903f2196b278
--- /dev/null
+++ b/benchmarks/queries/imdb/17b.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS member_in_charnamed_movie, MIN(n.name) AS a1 FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword ='character-name-in-title' AND n.name LIKE 'Z%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/17c.sql b/benchmarks/queries/imdb/17c.sql
new file mode 100644
index 000000000000..a96faa0b4339
--- /dev/null
+++ b/benchmarks/queries/imdb/17c.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS member_in_charnamed_movie, MIN(n.name) AS a1 FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword ='character-name-in-title' AND n.name LIKE 'X%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/17d.sql b/benchmarks/queries/imdb/17d.sql
new file mode 100644
index 000000000000..73e1f2c30976
--- /dev/null
+++ b/benchmarks/queries/imdb/17d.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS member_in_charnamed_movie FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword ='character-name-in-title' AND n.name LIKE '%Bert%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/17e.sql b/benchmarks/queries/imdb/17e.sql
new file mode 100644
index 000000000000..65ea73ed0510
--- /dev/null
+++ b/benchmarks/queries/imdb/17e.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS member_in_charnamed_movie FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/17f.sql b/benchmarks/queries/imdb/17f.sql
new file mode 100644
index 000000000000..542233d63e9d
--- /dev/null
+++ b/benchmarks/queries/imdb/17f.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS member_in_charnamed_movie FROM cast_info AS ci, company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword ='character-name-in-title' AND n.name LIKE '%B%' AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.movie_id = mc.movie_id AND ci.movie_id = mk.movie_id AND mc.movie_id = mk.movie_id;
diff --git a/benchmarks/queries/imdb/18a.sql b/benchmarks/queries/imdb/18a.sql
new file mode 100644
index 000000000000..275e04bdb184
--- /dev/null
+++ b/benchmarks/queries/imdb/18a.sql
@@ -0,0 +1 @@
+SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(t.title) AS movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, movie_info AS mi, movie_info_idx AS mi_idx, name AS n, title AS t WHERE ci.note in ('(producer)', '(executive producer)') AND it1.info = 'budget' AND it2.info = 'votes' AND n.gender = 'm' and n.name like '%Tim%' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/18b.sql b/benchmarks/queries/imdb/18b.sql
new file mode 100644
index 000000000000..3ae40ed93d2f
--- /dev/null
+++ b/benchmarks/queries/imdb/18b.sql
@@ -0,0 +1 @@
+SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(t.title) AS movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, movie_info AS mi, movie_info_idx AS mi_idx, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'rating' AND mi.info in ('Horror', 'Thriller') and mi.note is NULL AND mi_idx.info > '8.0' AND n.gender is not null and n.gender = 'f' AND t.production_year between 2008 and 2014 AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/18c.sql b/benchmarks/queries/imdb/18c.sql
new file mode 100644
index 000000000000..01f28ea527fe
--- /dev/null
+++ b/benchmarks/queries/imdb/18c.sql
@@ -0,0 +1 @@
+SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(t.title) AS movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, movie_info AS mi, movie_info_idx AS mi_idx, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND mi.info in ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War') AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND mi.movie_id = mi_idx.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/19a.sql b/benchmarks/queries/imdb/19a.sql
new file mode 100644
index 000000000000..ceaae671fd20
--- /dev/null
+++ b/benchmarks/queries/imdb/19a.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS voicing_actress, MIN(t.title) AS voiced_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, movie_companies AS mc, movie_info AS mi, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND mc.note is not NULL and (mc.note like '%(USA)%' or mc.note like '%(worldwide)%') AND mi.info is not null and (mi.info like 'Japan:%200%' or mi.info like 'USA:%200%') AND n.gender ='f' and n.name like '%Ang%' AND rt.role ='actress' AND t.production_year between 2005 and 2009 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mi.movie_id = ci.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id;
diff --git a/benchmarks/queries/imdb/19b.sql b/benchmarks/queries/imdb/19b.sql
new file mode 100644
index 000000000000..62e852ba3ec6
--- /dev/null
+++ b/benchmarks/queries/imdb/19b.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS voicing_actress, MIN(t.title) AS kung_fu_panda FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, movie_companies AS mc, movie_info AS mi, name AS n, role_type AS rt, title AS t WHERE ci.note = '(voice)' AND cn.country_code ='[us]' AND it.info = 'release dates' AND mc.note like '%(200%)%' and (mc.note like '%(USA)%' or mc.note like '%(worldwide)%') AND mi.info is not null and (mi.info like 'Japan:%2007%' or mi.info like 'USA:%2008%') AND n.gender ='f' and n.name like '%Angel%' AND rt.role ='actress' AND t.production_year between 2007 and 2008 and t.title like '%Kung%Fu%Panda%' AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mi.movie_id = ci.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id;
diff --git a/benchmarks/queries/imdb/19c.sql b/benchmarks/queries/imdb/19c.sql
new file mode 100644
index 000000000000..6885af5012fc
--- /dev/null
+++ b/benchmarks/queries/imdb/19c.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS voicing_actress, MIN(t.title) AS jap_engl_voiced_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, movie_companies AS mc, movie_info AS mi, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND mi.info is not null and (mi.info like 'Japan:%200%' or mi.info like 'USA:%200%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.production_year > 2000 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mi.movie_id = ci.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id;
diff --git a/benchmarks/queries/imdb/19d.sql b/benchmarks/queries/imdb/19d.sql
new file mode 100644
index 000000000000..06fcc76ba7ad
--- /dev/null
+++ b/benchmarks/queries/imdb/19d.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS voicing_actress, MIN(t.title) AS jap_engl_voiced_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, movie_companies AS mc, movie_info AS mi, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND n.gender ='f' AND rt.role ='actress' AND t.production_year > 2000 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mi.movie_id = ci.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id;
diff --git a/benchmarks/queries/imdb/1a.sql b/benchmarks/queries/imdb/1a.sql
new file mode 100644
index 000000000000..07b351638857
--- /dev/null
+++ b/benchmarks/queries/imdb/1a.sql
@@ -0,0 +1 @@
+SELECT MIN(mc.note) AS production_note, MIN(t.title) AS movie_title, MIN(t.production_year) AS movie_year FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info_idx AS mi_idx, title AS t WHERE ct.kind = 'production companies' AND it.info = 'top 250 rank' AND mc.note not like '%(as Metro-Goldwyn-Mayer Pictures)%' and (mc.note like '%(co-production)%' or mc.note like '%(presents)%') AND ct.id = mc.company_type_id AND t.id = mc.movie_id AND t.id = mi_idx.movie_id AND mc.movie_id = mi_idx.movie_id AND it.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/1b.sql b/benchmarks/queries/imdb/1b.sql
new file mode 100644
index 000000000000..f2901e8b5262
--- /dev/null
+++ b/benchmarks/queries/imdb/1b.sql
@@ -0,0 +1 @@
+SELECT MIN(mc.note) AS production_note, MIN(t.title) AS movie_title, MIN(t.production_year) AS movie_year FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info_idx AS mi_idx, title AS t WHERE ct.kind = 'production companies' AND it.info = 'bottom 10 rank' AND mc.note not like '%(as Metro-Goldwyn-Mayer Pictures)%' AND t.production_year between 2005 and 2010 AND ct.id = mc.company_type_id AND t.id = mc.movie_id AND t.id = mi_idx.movie_id AND mc.movie_id = mi_idx.movie_id AND it.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/1c.sql b/benchmarks/queries/imdb/1c.sql
new file mode 100644
index 000000000000..94e66c30aa14
--- /dev/null
+++ b/benchmarks/queries/imdb/1c.sql
@@ -0,0 +1 @@
+SELECT MIN(mc.note) AS production_note, MIN(t.title) AS movie_title, MIN(t.production_year) AS movie_year FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info_idx AS mi_idx, title AS t WHERE ct.kind = 'production companies' AND it.info = 'top 250 rank' AND mc.note not like '%(as Metro-Goldwyn-Mayer Pictures)%' and (mc.note like '%(co-production)%') AND t.production_year >2010 AND ct.id = mc.company_type_id AND t.id = mc.movie_id AND t.id = mi_idx.movie_id AND mc.movie_id = mi_idx.movie_id AND it.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/1d.sql b/benchmarks/queries/imdb/1d.sql
new file mode 100644
index 000000000000..52f58e80c811
--- /dev/null
+++ b/benchmarks/queries/imdb/1d.sql
@@ -0,0 +1 @@
+SELECT MIN(mc.note) AS production_note, MIN(t.title) AS movie_title, MIN(t.production_year) AS movie_year FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info_idx AS mi_idx, title AS t WHERE ct.kind = 'production companies' AND it.info = 'bottom 10 rank' AND mc.note not like '%(as Metro-Goldwyn-Mayer Pictures)%' AND t.production_year >2000 AND ct.id = mc.company_type_id AND t.id = mc.movie_id AND t.id = mi_idx.movie_id AND mc.movie_id = mi_idx.movie_id AND it.id = mi_idx.info_type_id;
diff --git a/benchmarks/queries/imdb/20a.sql b/benchmarks/queries/imdb/20a.sql
new file mode 100644
index 000000000000..2a1c269d6a51
--- /dev/null
+++ b/benchmarks/queries/imdb/20a.sql
@@ -0,0 +1 @@
+SELECT MIN(t.title) AS complete_downey_ironman_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, keyword AS k, kind_type AS kt, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name not like '%Sherlock%' and (chn.name like '%Tony%Stark%' or chn.name like '%Iron%Man%') AND k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND kt.kind = 'movie' AND t.production_year > 1950 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND ci.movie_id = cc.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id;
diff --git a/benchmarks/queries/imdb/20b.sql b/benchmarks/queries/imdb/20b.sql
new file mode 100644
index 000000000000..4c2455a52eb1
--- /dev/null
+++ b/benchmarks/queries/imdb/20b.sql
@@ -0,0 +1 @@
+SELECT MIN(t.title) AS complete_downey_ironman_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, keyword AS k, kind_type AS kt, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name not like '%Sherlock%' and (chn.name like '%Tony%Stark%' or chn.name like '%Iron%Man%') AND k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND kt.kind = 'movie' AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND ci.movie_id = cc.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id;
diff --git a/benchmarks/queries/imdb/20c.sql b/benchmarks/queries/imdb/20c.sql
new file mode 100644
index 000000000000..b85b22f6b4f2
--- /dev/null
+++ b/benchmarks/queries/imdb/20c.sql
@@ -0,0 +1 @@
+SELECT MIN(n.name) AS cast_member, MIN(t.title) AS complete_dynamic_hero_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, keyword AS k, kind_type AS kt, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name is not NULL and (chn.name like '%man%' or chn.name like '%Man%') AND k.keyword in ('superhero', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence', 'magnet', 'web', 'claw', 'laser') AND kt.kind = 'movie' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND ci.movie_id = cc.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id;
diff --git a/benchmarks/queries/imdb/21a.sql b/benchmarks/queries/imdb/21a.sql
new file mode 100644
index 000000000000..8a66a00be6cb
--- /dev/null
+++ b/benchmarks/queries/imdb/21a.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS company_name, MIN(lt.link) AS link_type, MIN(t.title) AS western_follow_up FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German') AND t.production_year BETWEEN 1950 AND 2000 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id;
diff --git a/benchmarks/queries/imdb/21b.sql b/benchmarks/queries/imdb/21b.sql
new file mode 100644
index 000000000000..90d3a5a4c078
--- /dev/null
+++ b/benchmarks/queries/imdb/21b.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS company_name, MIN(lt.link) AS link_type, MIN(t.title) AS german_follow_up FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Germany', 'German') AND t.production_year BETWEEN 2000 AND 2010 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id;
diff --git a/benchmarks/queries/imdb/21c.sql b/benchmarks/queries/imdb/21c.sql
new file mode 100644
index 000000000000..16a42ae6f426
--- /dev/null
+++ b/benchmarks/queries/imdb/21c.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS company_name, MIN(lt.link) AS link_type, MIN(t.title) AS western_follow_up FROM company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'English') AND t.production_year BETWEEN 1950 AND 2010 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id;
diff --git a/benchmarks/queries/imdb/22a.sql b/benchmarks/queries/imdb/22a.sql
new file mode 100644
index 000000000000..e513799698c5
--- /dev/null
+++ b/benchmarks/queries/imdb/22a.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS western_violent_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Germany', 'German', 'USA', 'American') AND mi_idx.info < '7.0' AND t.production_year > 2008 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mc.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id;
diff --git a/benchmarks/queries/imdb/22b.sql b/benchmarks/queries/imdb/22b.sql
new file mode 100644
index 000000000000..f98d0ea8099d
--- /dev/null
+++ b/benchmarks/queries/imdb/22b.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS western_violent_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Germany', 'German', 'USA', 'American') AND mi_idx.info < '7.0' AND t.production_year > 2009 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mc.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id;
diff --git a/benchmarks/queries/imdb/22c.sql b/benchmarks/queries/imdb/22c.sql
new file mode 100644
index 000000000000..cf757956e0de
--- /dev/null
+++ b/benchmarks/queries/imdb/22c.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS western_violent_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mc.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id;
diff --git a/benchmarks/queries/imdb/22d.sql b/benchmarks/queries/imdb/22d.sql
new file mode 100644
index 000000000000..a47feeb05157
--- /dev/null
+++ b/benchmarks/queries/imdb/22d.sql
@@ -0,0 +1 @@
+SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS western_violent_movie FROM company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mc.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id;
diff --git a/benchmarks/queries/imdb/23a.sql b/benchmarks/queries/imdb/23a.sql
new file mode 100644
index 000000000000..724da913b51a
--- /dev/null
+++ b/benchmarks/queries/imdb/23a.sql
@@ -0,0 +1 @@
+SELECT MIN(kt.kind) AS movie_kind, MIN(t.title) AS complete_us_internet_movie FROM complete_cast AS cc, comp_cast_type AS cct1, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cct1.kind = 'complete+verified' AND cn.country_code = '[us]' AND it1.info = 'release dates' AND kt.kind in ('movie') AND mi.note like '%internet%' AND mi.info is not NULL and (mi.info like 'USA:% 199%' or mi.info like 'USA:% 200%') AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND cct1.id = cc.status_id;
diff --git a/benchmarks/queries/imdb/23b.sql b/benchmarks/queries/imdb/23b.sql
new file mode 100644
index 000000000000..e39f0ecc28a2
--- /dev/null
+++ b/benchmarks/queries/imdb/23b.sql
@@ -0,0 +1 @@
+SELECT MIN(kt.kind) AS movie_kind, MIN(t.title) AS complete_nerdy_internet_movie FROM complete_cast AS cc, comp_cast_type AS cct1, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cct1.kind = 'complete+verified' AND cn.country_code = '[us]' AND it1.info = 'release dates' AND k.keyword in ('nerd', 'loner', 'alienation', 'dignity') AND kt.kind in ('movie') AND mi.note like '%internet%' AND mi.info like 'USA:% 200%' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND cct1.id = cc.status_id;
diff --git a/benchmarks/queries/imdb/23c.sql b/benchmarks/queries/imdb/23c.sql
new file mode 100644
index 000000000000..839d762d0533
--- /dev/null
+++ b/benchmarks/queries/imdb/23c.sql
@@ -0,0 +1 @@
+SELECT MIN(kt.kind) AS movie_kind, MIN(t.title) AS complete_us_internet_movie FROM complete_cast AS cc, comp_cast_type AS cct1, company_name AS cn, company_type AS ct, info_type AS it1, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, title AS t WHERE cct1.kind = 'complete+verified' AND cn.country_code = '[us]' AND it1.info = 'release dates' AND kt.kind in ('movie', 'tv movie', 'video movie', 'video game') AND mi.note like '%internet%' AND mi.info is not NULL and (mi.info like 'USA:% 199%' or mi.info like 'USA:% 200%') AND t.production_year > 1990 AND kt.id =
t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND cn.id = mc.company_id AND ct.id = mc.company_type_id AND cct1.id = cc.status_id; diff --git a/benchmarks/queries/imdb/24a.sql b/benchmarks/queries/imdb/24a.sql new file mode 100644 index 000000000000..8f10621e0209 --- /dev/null +++ b/benchmarks/queries/imdb/24a.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char_name, MIN(n.name) AS voicing_actress_name, MIN(t.title) AS voiced_action_movie_jap_eng FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND k.keyword in ('hero', 'martial-arts', 'hand-to-hand-combat') AND mi.info is not null and (mi.info like 'Japan:%201%' or mi.info like 'USA:%201%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.production_year > 2010 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND ci.movie_id = mk.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/24b.sql b/benchmarks/queries/imdb/24b.sql new file mode 100644 index 000000000000..d8a2836000b2 --- /dev/null +++ b/benchmarks/queries/imdb/24b.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char_name, MIN(n.name) AS voicing_actress_name, MIN(t.title) AS kung_fu_panda FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND cn.name = 'DreamWorks Animation' AND it.info = 'release dates' AND k.keyword in ('hero', 'martial-arts', 'hand-to-hand-combat', 'computer-animated-movie') AND mi.info is not null and (mi.info like 'Japan:%201%' or mi.info like 'USA:%201%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.production_year > 2010 AND t.title like 'Kung Fu Panda%' AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND ci.movie_id = mk.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/25a.sql b/benchmarks/queries/imdb/25a.sql new file mode 100644 index 000000000000..bc55cc01d26b --- /dev/null +++ b/benchmarks/queries/imdb/25a.sql @@ -0,0 
+1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS male_writer, MIN(t.title) AS violent_movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'blood', 'gore', 'death', 'female-nudity') AND mi.info = 'Horror' AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi_idx.movie_id = mk.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/25b.sql b/benchmarks/queries/imdb/25b.sql new file mode 100644 index 000000000000..3457655bb9eb --- /dev/null +++ b/benchmarks/queries/imdb/25b.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS male_writer, MIN(t.title) AS violent_movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'blood', 'gore', 'death', 'female-nudity') AND mi.info = 'Horror' AND n.gender = 'm' AND t.production_year > 2010 AND t.title like 'Vampire%' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi_idx.movie_id = mk.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/25c.sql b/benchmarks/queries/imdb/25c.sql new file mode 100644 index 000000000000..cf56a313d861 --- /dev/null +++ b/benchmarks/queries/imdb/25c.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS male_writer, MIN(t.title) AS violent_movie_title FROM cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War') AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi_idx.movie_id = mk.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/26a.sql b/benchmarks/queries/imdb/26a.sql new file mode 100644 index 000000000000..b431f204c6dc --- /dev/null +++ 
b/benchmarks/queries/imdb/26a.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS character_name, MIN(mi_idx.info) AS rating, MIN(n.name) AS playing_actor, MIN(t.title) AS complete_hero_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, info_type AS it2, keyword AS k, kind_type AS kt, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name is not NULL and (chn.name like '%man%' or chn.name like '%Man%') AND it2.info = 'rating' AND k.keyword in ('superhero', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence', 'magnet', 'web', 'claw', 'laser') AND kt.kind = 'movie' AND mi_idx.info > '7.0' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND mk.movie_id = mi_idx.movie_id AND ci.movie_id = cc.movie_id AND ci.movie_id = mi_idx.movie_id AND cc.movie_id = mi_idx.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/26b.sql b/benchmarks/queries/imdb/26b.sql new file mode 100644 index 000000000000..882d234d77e0 --- /dev/null +++ b/benchmarks/queries/imdb/26b.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS character_name, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_hero_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, info_type AS it2, keyword AS k, kind_type AS kt, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name is not NULL and (chn.name like '%man%' or chn.name like '%Man%') AND it2.info = 'rating' AND k.keyword in ('superhero', 'marvel-comics', 'based-on-comic', 'fight') AND kt.kind = 'movie' AND mi_idx.info > '8.0' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND t.id = mi_idx.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND mk.movie_id = mi_idx.movie_id AND ci.movie_id = cc.movie_id AND ci.movie_id = mi_idx.movie_id AND cc.movie_id = mi_idx.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/26c.sql b/benchmarks/queries/imdb/26c.sql new file mode 100644 index 000000000000..4b9eae0b7633 --- /dev/null +++ b/benchmarks/queries/imdb/26c.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS character_name, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_hero_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, info_type AS it2, keyword AS k, kind_type AS kt, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like '%complete%' AND chn.name is not NULL and (chn.name like '%man%' or chn.name like '%Man%') AND it2.info = 'rating' AND k.keyword in ('superhero', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence', 'magnet', 'web', 'claw', 'laser') AND kt.kind = 'movie' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND t.id = cc.movie_id AND t.id = 
mi_idx.movie_id AND mk.movie_id = ci.movie_id AND mk.movie_id = cc.movie_id AND mk.movie_id = mi_idx.movie_id AND ci.movie_id = cc.movie_id AND ci.movie_id = mi_idx.movie_id AND cc.movie_id = mi_idx.movie_id AND chn.id = ci.person_role_id AND n.id = ci.person_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND it2.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/27a.sql b/benchmarks/queries/imdb/27a.sql new file mode 100644 index 000000000000..239673cd8147 --- /dev/null +++ b/benchmarks/queries/imdb/27a.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS producing_company, MIN(lt.link) AS link_type, MIN(t.title) AS complete_western_sequel FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cct1.kind in ('cast', 'crew') AND cct2.kind = 'complete' AND cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Germany','Swedish', 'German') AND t.production_year BETWEEN 1950 AND 2000 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND t.id = cc.movie_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id AND ml.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = cc.movie_id; diff --git a/benchmarks/queries/imdb/27b.sql b/benchmarks/queries/imdb/27b.sql new file mode 100644 index 000000000000..4bf85260f22d --- /dev/null +++ b/benchmarks/queries/imdb/27b.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS producing_company, MIN(lt.link) AS link_type, MIN(t.title) AS complete_western_sequel FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cct1.kind in ('cast', 'crew') AND cct2.kind = 'complete' AND cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Germany','Swedish', 'German') AND t.production_year = 1998 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND t.id = cc.movie_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id AND ml.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = cc.movie_id; diff --git a/benchmarks/queries/imdb/27c.sql b/benchmarks/queries/imdb/27c.sql new file mode 100644 index 000000000000..dc26ebff6851 --- /dev/null +++ b/benchmarks/queries/imdb/27c.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS 
producing_company, MIN(lt.link) AS link_type, MIN(t.title) AS complete_western_sequel FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, keyword AS k, link_type AS lt, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, movie_link AS ml, title AS t WHERE cct1.kind = 'cast' AND cct2.kind like 'complete%' AND cn.country_code !='[pl]' AND (cn.name LIKE '%Film%' OR cn.name LIKE '%Warner%') AND ct.kind ='production companies' AND k.keyword ='sequel' AND lt.link LIKE '%follow%' AND mc.note IS NULL AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'English') AND t.production_year BETWEEN 1950 AND 2010 AND lt.id = ml.link_type_id AND ml.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND t.id = mc.movie_id AND mc.company_type_id = ct.id AND mc.company_id = cn.id AND mi.movie_id = t.id AND t.id = cc.movie_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id AND ml.movie_id = mk.movie_id AND ml.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND ml.movie_id = mi.movie_id AND mk.movie_id = mi.movie_id AND mc.movie_id = mi.movie_id AND ml.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = cc.movie_id; diff --git a/benchmarks/queries/imdb/28a.sql b/benchmarks/queries/imdb/28a.sql new file mode 100644 index 000000000000..8cb1177386da --- /dev/null +++ b/benchmarks/queries/imdb/28a.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_euro_dark_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cct1.kind = 'crew' AND cct2.kind != 'complete+verified' AND cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2000 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = mi_idx.movie_id AND mc.movie_id = cc.movie_id AND mi_idx.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/28b.sql b/benchmarks/queries/imdb/28b.sql new file mode 100644 index 000000000000..10f43c898226 --- /dev/null +++ b/benchmarks/queries/imdb/28b.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_euro_dark_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info 
AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cct1.kind = 'crew' AND cct2.kind != 'complete+verified' AND cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Sweden', 'Germany', 'Swedish', 'German') AND mi_idx.info > '6.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = mi_idx.movie_id AND mc.movie_id = cc.movie_id AND mi_idx.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/28c.sql b/benchmarks/queries/imdb/28c.sql new file mode 100644 index 000000000000..6b2e4047ae8a --- /dev/null +++ b/benchmarks/queries/imdb/28c.sql @@ -0,0 +1 @@ +SELECT MIN(cn.name) AS movie_company, MIN(mi_idx.info) AS rating, MIN(t.title) AS complete_euro_dark_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, company_name AS cn, company_type AS ct, info_type AS it1, info_type AS it2, keyword AS k, kind_type AS kt, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE cct1.kind = 'cast' AND cct2.kind = 'complete' AND cn.country_code != '[us]' AND it1.info = 'countries' AND it2.info = 'rating' AND k.keyword in ('murder', 'murder-in-title', 'blood', 'violence') AND kt.kind in ('movie', 'episode') AND mc.note not like '%(USA)%' and mc.note like '%(200%)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Danish', 'Norwegian', 'German', 'USA', 'American') AND mi_idx.info < '8.5' AND t.production_year > 2005 AND kt.id = t.kind_id AND t.id = mi.movie_id AND t.id = mk.movie_id AND t.id = mi_idx.movie_id AND t.id = mc.movie_id AND t.id = cc.movie_id AND mk.movie_id = mi.movie_id AND mk.movie_id = mi_idx.movie_id AND mk.movie_id = mc.movie_id AND mk.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mc.movie_id AND mi.movie_id = cc.movie_id AND mc.movie_id = mi_idx.movie_id AND mc.movie_id = cc.movie_id AND mi_idx.movie_id = cc.movie_id AND k.id = mk.keyword_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND ct.id = mc.company_type_id AND cn.id = mc.company_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/29a.sql b/benchmarks/queries/imdb/29a.sql new file mode 100644 index 000000000000..3033acbe6cf3 --- /dev/null +++ b/benchmarks/queries/imdb/29a.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char, MIN(n.name) AS voicing_actress, MIN(t.title) AS voiced_animation FROM aka_name AS an, complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, info_type AS it3, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, person_info AS pi, role_type AS rt, title AS t WHERE cct1.kind ='cast' AND cct2.kind ='complete+verified' AND chn.name = 'Queen' AND 
ci.note in ('(voice)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND it3.info = 'trivia' AND k.keyword = 'computer-animation' AND mi.info is not null and (mi.info like 'Japan:%200%' or mi.info like 'USA:%200%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.title = 'Shrek 2' AND t.production_year between 2000 and 2010 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND n.id = pi.person_id AND ci.person_id = pi.person_id AND it3.id = pi.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/29b.sql b/benchmarks/queries/imdb/29b.sql new file mode 100644 index 000000000000..88d50fc7b783 --- /dev/null +++ b/benchmarks/queries/imdb/29b.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char, MIN(n.name) AS voicing_actress, MIN(t.title) AS voiced_animation FROM aka_name AS an, complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, info_type AS it3, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_keyword AS mk, name AS n, person_info AS pi, role_type AS rt, title AS t WHERE cct1.kind ='cast' AND cct2.kind ='complete+verified' AND chn.name = 'Queen' AND ci.note in ('(voice)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND it3.info = 'height' AND k.keyword = 'computer-animation' AND mi.info like 'USA:%200%' AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.title = 'Shrek 2' AND t.production_year between 2000 and 2005 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND n.id = pi.person_id AND ci.person_id = pi.person_id AND it3.id = pi.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/29c.sql b/benchmarks/queries/imdb/29c.sql new file mode 100644 index 000000000000..cb951781827c --- /dev/null +++ b/benchmarks/queries/imdb/29c.sql @@ -0,0 +1 @@ +SELECT MIN(chn.name) AS voiced_char, MIN(n.name) AS voicing_actress, MIN(t.title) AS voiced_animation FROM aka_name AS an, complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, char_name AS chn, cast_info AS ci, company_name AS cn, info_type AS it, info_type AS it3, keyword AS k, movie_companies AS mc, movie_info AS mi, 
movie_keyword AS mk, name AS n, person_info AS pi, role_type AS rt, title AS t WHERE cct1.kind ='cast' AND cct2.kind ='complete+verified' AND ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND it.info = 'release dates' AND it3.info = 'trivia' AND k.keyword = 'computer-animation' AND mi.info is not null and (mi.info like 'Japan:%200%' or mi.info like 'USA:%200%') AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND t.production_year between 2000 and 2010 AND t.id = mi.movie_id AND t.id = mc.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND mc.movie_id = ci.movie_id AND mc.movie_id = mi.movie_id AND mc.movie_id = mk.movie_id AND mc.movie_id = cc.movie_id AND mi.movie_id = ci.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND cn.id = mc.company_id AND it.id = mi.info_type_id AND n.id = ci.person_id AND rt.id = ci.role_id AND n.id = an.person_id AND ci.person_id = an.person_id AND chn.id = ci.person_role_id AND n.id = pi.person_id AND ci.person_id = pi.person_id AND it3.id = pi.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/2a.sql b/benchmarks/queries/imdb/2a.sql new file mode 100644 index 000000000000..f3ef4db75fea --- /dev/null +++ b/benchmarks/queries/imdb/2a.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, title AS t WHERE cn.country_code ='[de]' AND k.keyword ='character-name-in-title' AND cn.id = mc.company_id AND mc.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/2b.sql b/benchmarks/queries/imdb/2b.sql new file mode 100644 index 000000000000..82b2123fbccd --- /dev/null +++ b/benchmarks/queries/imdb/2b.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, title AS t WHERE cn.country_code ='[nl]' AND k.keyword ='character-name-in-title' AND cn.id = mc.company_id AND mc.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/2c.sql b/benchmarks/queries/imdb/2c.sql new file mode 100644 index 000000000000..b5f9b75dd68b --- /dev/null +++ b/benchmarks/queries/imdb/2c.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, title AS t WHERE cn.country_code ='[sm]' AND k.keyword ='character-name-in-title' AND cn.id = mc.company_id AND mc.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/2d.sql b/benchmarks/queries/imdb/2d.sql new file mode 100644 index 000000000000..4a2791946548 --- /dev/null +++ b/benchmarks/queries/imdb/2d.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM company_name AS cn, keyword AS k, movie_companies AS mc, movie_keyword AS mk, title AS t WHERE cn.country_code ='[us]' AND k.keyword ='character-name-in-title' AND cn.id = mc.company_id AND mc.movie_id = t.id AND t.id = mk.movie_id AND mk.keyword_id = k.id AND mc.movie_id = mk.movie_id; diff --git a/benchmarks/queries/imdb/30a.sql b/benchmarks/queries/imdb/30a.sql new file mode 100644 index 
000000000000..698872fa8337 --- /dev/null +++ b/benchmarks/queries/imdb/30a.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS complete_violent_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind in ('cast', 'crew') AND cct2.kind ='complete+verified' AND ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Thriller') AND n.gender = 'm' AND t.production_year > 2000 AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/30b.sql b/benchmarks/queries/imdb/30b.sql new file mode 100644 index 000000000000..5fdb8493496c --- /dev/null +++ b/benchmarks/queries/imdb/30b.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS complete_gore_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind in ('cast', 'crew') AND cct2.kind ='complete+verified' AND ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Thriller') AND n.gender = 'm' AND t.production_year > 2000 and (t.title like '%Freddy%' or t.title like '%Jason%' or t.title like 'Saw%') AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/30c.sql b/benchmarks/queries/imdb/30c.sql new file mode 100644 index 000000000000..a18087e39222 --- /dev/null +++ b/benchmarks/queries/imdb/30c.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS complete_violent_movie FROM complete_cast AS cc, comp_cast_type AS cct1, comp_cast_type AS cct2, cast_info AS ci, info_type AS it1, info_type AS it2, keyword AS k, 
movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE cct1.kind = 'cast' AND cct2.kind ='complete+verified' AND ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War') AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = cc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = cc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = cc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = cc.movie_id AND mk.movie_id = cc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cct1.id = cc.subject_id AND cct2.id = cc.status_id; diff --git a/benchmarks/queries/imdb/31a.sql b/benchmarks/queries/imdb/31a.sql new file mode 100644 index 000000000000..7dd855011f2a --- /dev/null +++ b/benchmarks/queries/imdb/31a.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS violent_liongate_movie FROM cast_info AS ci, company_name AS cn, info_type AS it1, info_type AS it2, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND cn.name like 'Lionsgate%' AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Thriller') AND n.gender = 'm' AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = mc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/31b.sql b/benchmarks/queries/imdb/31b.sql new file mode 100644 index 000000000000..3be5680f7d00 --- /dev/null +++ b/benchmarks/queries/imdb/31b.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS violent_liongate_movie FROM cast_info AS ci, company_name AS cn, info_type AS it1, info_type AS it2, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND cn.name like 'Lionsgate%' AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mc.note like '%(Blu-ray)%' AND mi.info in ('Horror', 'Thriller') AND n.gender = 'm' AND t.production_year > 2000 and (t.title like '%Freddy%' or t.title like '%Jason%' or t.title like 'Saw%') AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id 
= ci.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = mc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/31c.sql b/benchmarks/queries/imdb/31c.sql new file mode 100644 index 000000000000..156ea2d5eee2 --- /dev/null +++ b/benchmarks/queries/imdb/31c.sql @@ -0,0 +1 @@ +SELECT MIN(mi.info) AS movie_budget, MIN(mi_idx.info) AS movie_votes, MIN(n.name) AS writer, MIN(t.title) AS violent_liongate_movie FROM cast_info AS ci, company_name AS cn, info_type AS it1, info_type AS it2, keyword AS k, movie_companies AS mc, movie_info AS mi, movie_info_idx AS mi_idx, movie_keyword AS mk, name AS n, title AS t WHERE ci.note in ('(writer)', '(head writer)', '(written by)', '(story)', '(story editor)') AND cn.name like 'Lionsgate%' AND it1.info = 'genres' AND it2.info = 'votes' AND k.keyword in ('murder', 'violence', 'blood', 'gore', 'death', 'female-nudity', 'hospital') AND mi.info in ('Horror', 'Action', 'Sci-Fi', 'Thriller', 'Crime', 'War') AND t.id = mi.movie_id AND t.id = mi_idx.movie_id AND t.id = ci.movie_id AND t.id = mk.movie_id AND t.id = mc.movie_id AND ci.movie_id = mi.movie_id AND ci.movie_id = mi_idx.movie_id AND ci.movie_id = mk.movie_id AND ci.movie_id = mc.movie_id AND mi.movie_id = mi_idx.movie_id AND mi.movie_id = mk.movie_id AND mi.movie_id = mc.movie_id AND mi_idx.movie_id = mk.movie_id AND mi_idx.movie_id = mc.movie_id AND mk.movie_id = mc.movie_id AND n.id = ci.person_id AND it1.id = mi.info_type_id AND it2.id = mi_idx.info_type_id AND k.id = mk.keyword_id AND cn.id = mc.company_id; diff --git a/benchmarks/queries/imdb/32a.sql b/benchmarks/queries/imdb/32a.sql new file mode 100644 index 000000000000..9647fb71065d --- /dev/null +++ b/benchmarks/queries/imdb/32a.sql @@ -0,0 +1 @@ +SELECT MIN(lt.link) AS link_type, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM keyword AS k, link_type AS lt, movie_keyword AS mk, movie_link AS ml, title AS t1, title AS t2 WHERE k.keyword ='10,000-mile-club' AND mk.keyword_id = k.id AND t1.id = mk.movie_id AND ml.movie_id = t1.id AND ml.linked_movie_id = t2.id AND lt.id = ml.link_type_id AND mk.movie_id = t1.id; diff --git a/benchmarks/queries/imdb/32b.sql b/benchmarks/queries/imdb/32b.sql new file mode 100644 index 000000000000..6d096ab43405 --- /dev/null +++ b/benchmarks/queries/imdb/32b.sql @@ -0,0 +1 @@ +SELECT MIN(lt.link) AS link_type, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM keyword AS k, link_type AS lt, movie_keyword AS mk, movie_link AS ml, title AS t1, title AS t2 WHERE k.keyword ='character-name-in-title' AND mk.keyword_id = k.id AND t1.id = mk.movie_id AND ml.movie_id = t1.id AND ml.linked_movie_id = t2.id AND lt.id = ml.link_type_id AND mk.movie_id = t1.id; diff --git a/benchmarks/queries/imdb/33a.sql b/benchmarks/queries/imdb/33a.sql new file mode 100644 index 000000000000..24aac4e20797 --- /dev/null +++ b/benchmarks/queries/imdb/33a.sql @@ -0,0 +1 @@ +SELECT MIN(cn1.name) AS first_company, MIN(cn2.name) AS second_company, MIN(mi_idx1.info) AS first_rating, MIN(mi_idx2.info) AS second_rating, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM 
company_name AS cn1, company_name AS cn2, info_type AS it1, info_type AS it2, kind_type AS kt1, kind_type AS kt2, link_type AS lt, movie_companies AS mc1, movie_companies AS mc2, movie_info_idx AS mi_idx1, movie_info_idx AS mi_idx2, movie_link AS ml, title AS t1, title AS t2 WHERE cn1.country_code = '[us]' AND it1.info = 'rating' AND it2.info = 'rating' AND kt1.kind in ('tv series') AND kt2.kind in ('tv series') AND lt.link in ('sequel', 'follows', 'followed by') AND mi_idx2.info < '3.0' AND t2.production_year between 2005 and 2008 AND lt.id = ml.link_type_id AND t1.id = ml.movie_id AND t2.id = ml.linked_movie_id AND it1.id = mi_idx1.info_type_id AND t1.id = mi_idx1.movie_id AND kt1.id = t1.kind_id AND cn1.id = mc1.company_id AND t1.id = mc1.movie_id AND ml.movie_id = mi_idx1.movie_id AND ml.movie_id = mc1.movie_id AND mi_idx1.movie_id = mc1.movie_id AND it2.id = mi_idx2.info_type_id AND t2.id = mi_idx2.movie_id AND kt2.id = t2.kind_id AND cn2.id = mc2.company_id AND t2.id = mc2.movie_id AND ml.linked_movie_id = mi_idx2.movie_id AND ml.linked_movie_id = mc2.movie_id AND mi_idx2.movie_id = mc2.movie_id; diff --git a/benchmarks/queries/imdb/33b.sql b/benchmarks/queries/imdb/33b.sql new file mode 100644 index 000000000000..fe6fd75a6948 --- /dev/null +++ b/benchmarks/queries/imdb/33b.sql @@ -0,0 +1 @@ +SELECT MIN(cn1.name) AS first_company, MIN(cn2.name) AS second_company, MIN(mi_idx1.info) AS first_rating, MIN(mi_idx2.info) AS second_rating, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM company_name AS cn1, company_name AS cn2, info_type AS it1, info_type AS it2, kind_type AS kt1, kind_type AS kt2, link_type AS lt, movie_companies AS mc1, movie_companies AS mc2, movie_info_idx AS mi_idx1, movie_info_idx AS mi_idx2, movie_link AS ml, title AS t1, title AS t2 WHERE cn1.country_code = '[nl]' AND it1.info = 'rating' AND it2.info = 'rating' AND kt1.kind in ('tv series') AND kt2.kind in ('tv series') AND lt.link LIKE '%follow%' AND mi_idx2.info < '3.0' AND t2.production_year = 2007 AND lt.id = ml.link_type_id AND t1.id = ml.movie_id AND t2.id = ml.linked_movie_id AND it1.id = mi_idx1.info_type_id AND t1.id = mi_idx1.movie_id AND kt1.id = t1.kind_id AND cn1.id = mc1.company_id AND t1.id = mc1.movie_id AND ml.movie_id = mi_idx1.movie_id AND ml.movie_id = mc1.movie_id AND mi_idx1.movie_id = mc1.movie_id AND it2.id = mi_idx2.info_type_id AND t2.id = mi_idx2.movie_id AND kt2.id = t2.kind_id AND cn2.id = mc2.company_id AND t2.id = mc2.movie_id AND ml.linked_movie_id = mi_idx2.movie_id AND ml.linked_movie_id = mc2.movie_id AND mi_idx2.movie_id = mc2.movie_id; diff --git a/benchmarks/queries/imdb/33c.sql b/benchmarks/queries/imdb/33c.sql new file mode 100644 index 000000000000..c9f0907d3f90 --- /dev/null +++ b/benchmarks/queries/imdb/33c.sql @@ -0,0 +1 @@ +SELECT MIN(cn1.name) AS first_company, MIN(cn2.name) AS second_company, MIN(mi_idx1.info) AS first_rating, MIN(mi_idx2.info) AS second_rating, MIN(t1.title) AS first_movie, MIN(t2.title) AS second_movie FROM company_name AS cn1, company_name AS cn2, info_type AS it1, info_type AS it2, kind_type AS kt1, kind_type AS kt2, link_type AS lt, movie_companies AS mc1, movie_companies AS mc2, movie_info_idx AS mi_idx1, movie_info_idx AS mi_idx2, movie_link AS ml, title AS t1, title AS t2 WHERE cn1.country_code != '[us]' AND it1.info = 'rating' AND it2.info = 'rating' AND kt1.kind in ('tv series', 'episode') AND kt2.kind in ('tv series', 'episode') AND lt.link in ('sequel', 'follows', 'followed by') AND mi_idx2.info < '3.5' AND 
t2.production_year between 2000 and 2010 AND lt.id = ml.link_type_id AND t1.id = ml.movie_id AND t2.id = ml.linked_movie_id AND it1.id = mi_idx1.info_type_id AND t1.id = mi_idx1.movie_id AND kt1.id = t1.kind_id AND cn1.id = mc1.company_id AND t1.id = mc1.movie_id AND ml.movie_id = mi_idx1.movie_id AND ml.movie_id = mc1.movie_id AND mi_idx1.movie_id = mc1.movie_id AND it2.id = mi_idx2.info_type_id AND t2.id = mi_idx2.movie_id AND kt2.id = t2.kind_id AND cn2.id = mc2.company_id AND t2.id = mc2.movie_id AND ml.linked_movie_id = mi_idx2.movie_id AND ml.linked_movie_id = mc2.movie_id AND mi_idx2.movie_id = mc2.movie_id; diff --git a/benchmarks/queries/imdb/3a.sql b/benchmarks/queries/imdb/3a.sql new file mode 100644 index 000000000000..231c957be207 --- /dev/null +++ b/benchmarks/queries/imdb/3a.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM keyword AS k, movie_info AS mi, movie_keyword AS mk, title AS t WHERE k.keyword like '%sequel%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German') AND t.production_year > 2005 AND t.id = mi.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi.movie_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/3b.sql b/benchmarks/queries/imdb/3b.sql new file mode 100644 index 000000000000..fd21efc81014 --- /dev/null +++ b/benchmarks/queries/imdb/3b.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM keyword AS k, movie_info AS mi, movie_keyword AS mk, title AS t WHERE k.keyword like '%sequel%' AND mi.info IN ('Bulgaria') AND t.production_year > 2010 AND t.id = mi.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi.movie_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/3c.sql b/benchmarks/queries/imdb/3c.sql new file mode 100644 index 000000000000..5f34232a2e61 --- /dev/null +++ b/benchmarks/queries/imdb/3c.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS movie_title FROM keyword AS k, movie_info AS mi, movie_keyword AS mk, title AS t WHERE k.keyword like '%sequel%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'American') AND t.production_year > 1990 AND t.id = mi.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi.movie_id AND k.id = mk.keyword_id; diff --git a/benchmarks/queries/imdb/4a.sql b/benchmarks/queries/imdb/4a.sql new file mode 100644 index 000000000000..636afab02c8a --- /dev/null +++ b/benchmarks/queries/imdb/4a.sql @@ -0,0 +1 @@ +SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS movie_title FROM info_type AS it, keyword AS k, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it.info ='rating' AND k.keyword like '%sequel%' AND mi_idx.info > '5.0' AND t.production_year > 2005 AND t.id = mi_idx.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/4b.sql b/benchmarks/queries/imdb/4b.sql new file mode 100644 index 000000000000..ebd3e8992060 --- /dev/null +++ b/benchmarks/queries/imdb/4b.sql @@ -0,0 +1 @@ +SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS movie_title FROM info_type AS it, keyword AS k, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it.info ='rating' AND k.keyword like '%sequel%' AND mi_idx.info > '9.0' AND t.production_year > 2010 AND t.id = mi_idx.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/4c.sql b/benchmarks/queries/imdb/4c.sql new 
file mode 100644 index 000000000000..309281200f98 --- /dev/null +++ b/benchmarks/queries/imdb/4c.sql @@ -0,0 +1 @@ +SELECT MIN(mi_idx.info) AS rating, MIN(t.title) AS movie_title FROM info_type AS it, keyword AS k, movie_info_idx AS mi_idx, movie_keyword AS mk, title AS t WHERE it.info ='rating' AND k.keyword like '%sequel%' AND mi_idx.info > '2.0' AND t.production_year > 1990 AND t.id = mi_idx.movie_id AND t.id = mk.movie_id AND mk.movie_id = mi_idx.movie_id AND k.id = mk.keyword_id AND it.id = mi_idx.info_type_id; diff --git a/benchmarks/queries/imdb/5a.sql b/benchmarks/queries/imdb/5a.sql new file mode 100644 index 000000000000..04aae9881f7e --- /dev/null +++ b/benchmarks/queries/imdb/5a.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS typical_european_movie FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info AS mi, title AS t WHERE ct.kind = 'production companies' AND mc.note like '%(theatrical)%' and mc.note like '%(France)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German') AND t.production_year > 2005 AND t.id = mi.movie_id AND t.id = mc.movie_id AND mc.movie_id = mi.movie_id AND ct.id = mc.company_type_id AND it.id = mi.info_type_id; diff --git a/benchmarks/queries/imdb/5b.sql b/benchmarks/queries/imdb/5b.sql new file mode 100644 index 000000000000..f03a519d61b3 --- /dev/null +++ b/benchmarks/queries/imdb/5b.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS american_vhs_movie FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info AS mi, title AS t WHERE ct.kind = 'production companies' AND mc.note like '%(VHS)%' and mc.note like '%(USA)%' and mc.note like '%(1994)%' AND mi.info IN ('USA', 'America') AND t.production_year > 2010 AND t.id = mi.movie_id AND t.id = mc.movie_id AND mc.movie_id = mi.movie_id AND ct.id = mc.company_type_id AND it.id = mi.info_type_id; diff --git a/benchmarks/queries/imdb/5c.sql b/benchmarks/queries/imdb/5c.sql new file mode 100644 index 000000000000..2705e7e2c7a0 --- /dev/null +++ b/benchmarks/queries/imdb/5c.sql @@ -0,0 +1 @@ +SELECT MIN(t.title) AS american_movie FROM company_type AS ct, info_type AS it, movie_companies AS mc, movie_info AS mi, title AS t WHERE ct.kind = 'production companies' AND mc.note not like '%(TV)%' and mc.note like '%(USA)%' AND mi.info IN ('Sweden', 'Norway', 'Germany', 'Denmark', 'Swedish', 'Denish', 'Norwegian', 'German', 'USA', 'American') AND t.production_year > 1990 AND t.id = mi.movie_id AND t.id = mc.movie_id AND mc.movie_id = mi.movie_id AND ct.id = mc.company_type_id AND it.id = mi.info_type_id; diff --git a/benchmarks/queries/imdb/6a.sql b/benchmarks/queries/imdb/6a.sql new file mode 100644 index 000000000000..34b3a6da5fd2 --- /dev/null +++ b/benchmarks/queries/imdb/6a.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS marvel_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword = 'marvel-cinematic-universe' AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2010 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6b.sql b/benchmarks/queries/imdb/6b.sql new file mode 100644 index 000000000000..1233c41e66b0 --- /dev/null +++ b/benchmarks/queries/imdb/6b.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS hero_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name 
AS n, title AS t WHERE k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2014 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6c.sql b/benchmarks/queries/imdb/6c.sql new file mode 100644 index 000000000000..d1f97746e15e --- /dev/null +++ b/benchmarks/queries/imdb/6c.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS marvel_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword = 'marvel-cinematic-universe' AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2014 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6d.sql b/benchmarks/queries/imdb/6d.sql new file mode 100644 index 000000000000..07729510a454 --- /dev/null +++ b/benchmarks/queries/imdb/6d.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS hero_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2000 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6e.sql b/benchmarks/queries/imdb/6e.sql new file mode 100644 index 000000000000..2e77873fd81d --- /dev/null +++ b/benchmarks/queries/imdb/6e.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS marvel_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword = 'marvel-cinematic-universe' AND n.name LIKE '%Downey%Robert%' AND t.production_year > 2000 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/6f.sql b/benchmarks/queries/imdb/6f.sql new file mode 100644 index 000000000000..603901129107 --- /dev/null +++ b/benchmarks/queries/imdb/6f.sql @@ -0,0 +1 @@ +SELECT MIN(k.keyword) AS movie_keyword, MIN(n.name) AS actor_name, MIN(t.title) AS hero_movie FROM cast_info AS ci, keyword AS k, movie_keyword AS mk, name AS n, title AS t WHERE k.keyword in ('superhero', 'sequel', 'second-part', 'marvel-comics', 'based-on-comic', 'tv-special', 'fight', 'violence') AND t.production_year > 2000 AND k.id = mk.keyword_id AND t.id = mk.movie_id AND t.id = ci.movie_id AND ci.movie_id = mk.movie_id AND n.id = ci.person_id; diff --git a/benchmarks/queries/imdb/7a.sql b/benchmarks/queries/imdb/7a.sql new file mode 100644 index 000000000000..c6b26ce36f11 --- /dev/null +++ b/benchmarks/queries/imdb/7a.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS of_person, MIN(t.title) AS biography_movie FROM aka_name AS an, cast_info AS ci, info_type AS it, link_type AS lt, movie_link AS ml, name AS n, person_info AS pi, title AS t WHERE an.name LIKE '%a%' AND it.info ='mini biography' AND lt.link ='features' AND n.name_pcode_cf BETWEEN 'A' AND 'F' AND (n.gender='m' OR (n.gender = 'f' AND n.name LIKE 'B%')) AND pi.note ='Volker Boehm' AND t.production_year BETWEEN 1980 AND 1995 AND n.id = 
an.person_id AND n.id = pi.person_id AND ci.person_id = n.id AND t.id = ci.movie_id AND ml.linked_movie_id = t.id AND lt.id = ml.link_type_id AND it.id = pi.info_type_id AND pi.person_id = an.person_id AND pi.person_id = ci.person_id AND an.person_id = ci.person_id AND ci.movie_id = ml.linked_movie_id; diff --git a/benchmarks/queries/imdb/7b.sql b/benchmarks/queries/imdb/7b.sql new file mode 100644 index 000000000000..4e4f6e7615cb --- /dev/null +++ b/benchmarks/queries/imdb/7b.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS of_person, MIN(t.title) AS biography_movie FROM aka_name AS an, cast_info AS ci, info_type AS it, link_type AS lt, movie_link AS ml, name AS n, person_info AS pi, title AS t WHERE an.name LIKE '%a%' AND it.info ='mini biography' AND lt.link ='features' AND n.name_pcode_cf LIKE 'D%' AND n.gender='m' AND pi.note ='Volker Boehm' AND t.production_year BETWEEN 1980 AND 1984 AND n.id = an.person_id AND n.id = pi.person_id AND ci.person_id = n.id AND t.id = ci.movie_id AND ml.linked_movie_id = t.id AND lt.id = ml.link_type_id AND it.id = pi.info_type_id AND pi.person_id = an.person_id AND pi.person_id = ci.person_id AND an.person_id = ci.person_id AND ci.movie_id = ml.linked_movie_id; diff --git a/benchmarks/queries/imdb/7c.sql b/benchmarks/queries/imdb/7c.sql new file mode 100644 index 000000000000..a399342fae02 --- /dev/null +++ b/benchmarks/queries/imdb/7c.sql @@ -0,0 +1 @@ +SELECT MIN(n.name) AS cast_member_name, MIN(pi.info) AS cast_member_info FROM aka_name AS an, cast_info AS ci, info_type AS it, link_type AS lt, movie_link AS ml, name AS n, person_info AS pi, title AS t WHERE an.name is not NULL and (an.name LIKE '%a%' or an.name LIKE 'A%') AND it.info ='mini biography' AND lt.link in ('references', 'referenced in', 'features', 'featured in') AND n.name_pcode_cf BETWEEN 'A' AND 'F' AND (n.gender='m' OR (n.gender = 'f' AND n.name LIKE 'A%')) AND pi.note is not NULL AND t.production_year BETWEEN 1980 AND 2010 AND n.id = an.person_id AND n.id = pi.person_id AND ci.person_id = n.id AND t.id = ci.movie_id AND ml.linked_movie_id = t.id AND lt.id = ml.link_type_id AND it.id = pi.info_type_id AND pi.person_id = an.person_id AND pi.person_id = ci.person_id AND an.person_id = ci.person_id AND ci.movie_id = ml.linked_movie_id; diff --git a/benchmarks/queries/imdb/8a.sql b/benchmarks/queries/imdb/8a.sql new file mode 100644 index 000000000000..66ed05880d5f --- /dev/null +++ b/benchmarks/queries/imdb/8a.sql @@ -0,0 +1 @@ +SELECT MIN(an1.name) AS actress_pseudonym, MIN(t.title) AS japanese_movie_dubbed FROM aka_name AS an1, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n1, role_type AS rt, title AS t WHERE ci.note ='(voice: English version)' AND cn.country_code ='[jp]' AND mc.note like '%(Japan)%' and mc.note not like '%(USA)%' AND n1.name like '%Yo%' and n1.name not like '%Yu%' AND rt.role ='actress' AND an1.person_id = n1.id AND n1.id = ci.person_id AND ci.movie_id = t.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND an1.person_id = ci.person_id AND ci.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/8b.sql b/benchmarks/queries/imdb/8b.sql new file mode 100644 index 000000000000..044b5f8e8649 --- /dev/null +++ b/benchmarks/queries/imdb/8b.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS acress_pseudonym, MIN(t.title) AS japanese_anime_movie FROM aka_name AS an, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note ='(voice: English version)' AND cn.country_code ='[jp]' AND 
mc.note like '%(Japan)%' and mc.note not like '%(USA)%' and (mc.note like '%(2006)%' or mc.note like '%(2007)%') AND n.name like '%Yo%' and n.name not like '%Yu%' AND rt.role ='actress' AND t.production_year between 2006 and 2007 and (t.title like 'One Piece%' or t.title like 'Dragon Ball Z%') AND an.person_id = n.id AND n.id = ci.person_id AND ci.movie_id = t.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND an.person_id = ci.person_id AND ci.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/8c.sql b/benchmarks/queries/imdb/8c.sql new file mode 100644 index 000000000000..d02b74c02c5e --- /dev/null +++ b/benchmarks/queries/imdb/8c.sql @@ -0,0 +1 @@ +SELECT MIN(a1.name) AS writer_pseudo_name, MIN(t.title) AS movie_title FROM aka_name AS a1, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n1, role_type AS rt, title AS t WHERE cn.country_code ='[us]' AND rt.role ='writer' AND a1.person_id = n1.id AND n1.id = ci.person_id AND ci.movie_id = t.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND a1.person_id = ci.person_id AND ci.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/8d.sql b/benchmarks/queries/imdb/8d.sql new file mode 100644 index 000000000000..0834c0ff5cb7 --- /dev/null +++ b/benchmarks/queries/imdb/8d.sql @@ -0,0 +1 @@ +SELECT MIN(an1.name) AS costume_designer_pseudo, MIN(t.title) AS movie_with_costumes FROM aka_name AS an1, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n1, role_type AS rt, title AS t WHERE cn.country_code ='[us]' AND rt.role ='costume designer' AND an1.person_id = n1.id AND n1.id = ci.person_id AND ci.movie_id = t.id AND t.id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND an1.person_id = ci.person_id AND ci.movie_id = mc.movie_id; diff --git a/benchmarks/queries/imdb/9a.sql b/benchmarks/queries/imdb/9a.sql new file mode 100644 index 000000000000..593b16213b06 --- /dev/null +++ b/benchmarks/queries/imdb/9a.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS alternative_name, MIN(chn.name) AS character_name, MIN(t.title) AS movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND mc.note is not NULL and (mc.note like '%(USA)%' or mc.note like '%(worldwide)%') AND n.gender ='f' and n.name like '%Ang%' AND rt.role ='actress' AND t.production_year between 2005 and 2015 AND ci.movie_id = t.id AND t.id = mc.movie_id AND ci.movie_id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND n.id = ci.person_id AND chn.id = ci.person_role_id AND an.person_id = n.id AND an.person_id = ci.person_id; diff --git a/benchmarks/queries/imdb/9b.sql b/benchmarks/queries/imdb/9b.sql new file mode 100644 index 000000000000..a4933fd6856e --- /dev/null +++ b/benchmarks/queries/imdb/9b.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS alternative_name, MIN(chn.name) AS voiced_character, MIN(n.name) AS voicing_actress, MIN(t.title) AS american_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note = '(voice)' AND cn.country_code ='[us]' AND mc.note like '%(200%)%' and (mc.note like '%(USA)%' or mc.note like '%(worldwide)%') AND n.gender ='f' and n.name like '%Angel%' AND rt.role ='actress' AND t.production_year between 2007 and 2010 
AND ci.movie_id = t.id AND t.id = mc.movie_id AND ci.movie_id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND n.id = ci.person_id AND chn.id = ci.person_role_id AND an.person_id = n.id AND an.person_id = ci.person_id; diff --git a/benchmarks/queries/imdb/9c.sql b/benchmarks/queries/imdb/9c.sql new file mode 100644 index 000000000000..0be511810cf6 --- /dev/null +++ b/benchmarks/queries/imdb/9c.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS alternative_name, MIN(chn.name) AS voiced_character_name, MIN(n.name) AS voicing_actress, MIN(t.title) AS american_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND n.gender ='f' and n.name like '%An%' AND rt.role ='actress' AND ci.movie_id = t.id AND t.id = mc.movie_id AND ci.movie_id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND n.id = ci.person_id AND chn.id = ci.person_role_id AND an.person_id = n.id AND an.person_id = ci.person_id; diff --git a/benchmarks/queries/imdb/9d.sql b/benchmarks/queries/imdb/9d.sql new file mode 100644 index 000000000000..51262ca5ebae --- /dev/null +++ b/benchmarks/queries/imdb/9d.sql @@ -0,0 +1 @@ +SELECT MIN(an.name) AS alternative_name, MIN(chn.name) AS voiced_char_name, MIN(n.name) AS voicing_actress, MIN(t.title) AS american_movie FROM aka_name AS an, char_name AS chn, cast_info AS ci, company_name AS cn, movie_companies AS mc, name AS n, role_type AS rt, title AS t WHERE ci.note in ('(voice)', '(voice: Japanese version)', '(voice) (uncredited)', '(voice: English version)') AND cn.country_code ='[us]' AND n.gender ='f' AND rt.role ='actress' AND ci.movie_id = t.id AND t.id = mc.movie_id AND ci.movie_id = mc.movie_id AND mc.company_id = cn.id AND ci.role_id = rt.id AND n.id = ci.person_id AND chn.id = ci.person_role_id AND an.person_id = n.id AND an.person_id = ci.person_id; diff --git a/benchmarks/src/bin/dfbench.rs b/benchmarks/src/bin/dfbench.rs index 9ce6848a063a..f7b84116e793 100644 --- a/benchmarks/src/bin/dfbench.rs +++ b/benchmarks/src/bin/dfbench.rs @@ -33,7 +33,7 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; -use datafusion_benchmarks::{clickbench, parquet_filter, sort, tpch}; +use datafusion_benchmarks::{clickbench, imdb, parquet_filter, sort, tpch}; #[derive(Debug, StructOpt)] #[structopt(about = "benchmark command")] @@ -43,6 +43,7 @@ enum Options { Clickbench(clickbench::RunOpt), ParquetFilter(parquet_filter::RunOpt), Sort(sort::RunOpt), + Imdb(imdb::RunOpt), } // Main benchmark runner entrypoint @@ -56,5 +57,6 @@ pub async fn main() -> Result<()> { Options::Clickbench(opt) => opt.run().await, Options::ParquetFilter(opt) => opt.run().await, Options::Sort(opt) => opt.run().await, + Options::Imdb(opt) => opt.run().await, } } diff --git a/benchmarks/src/bin/imdb.rs b/benchmarks/src/bin/imdb.rs index 40efb84b0501..13421f8a89a9 100644 --- a/benchmarks/src/bin/imdb.rs +++ b/benchmarks/src/bin/imdb.rs @@ -34,9 +34,17 @@ static ALLOC: snmalloc_rs::SnMalloc = snmalloc_rs::SnMalloc; #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; +#[derive(Debug, StructOpt)] +#[structopt(about = "benchmark command")] +enum BenchmarkSubCommandOpt { + #[structopt(name = "datafusion")] + DataFusionBenchmark(imdb::RunOpt), +} + #[derive(Debug, 
StructOpt)] #[structopt(name = "IMDB", about = "IMDB Dataset Processing.")] enum ImdbOpt { + Benchmark(BenchmarkSubCommandOpt), Convert(imdb::ConvertOpt), } @@ -44,6 +52,9 @@ enum ImdbOpt { pub async fn main() -> Result<()> { env_logger::init(); match ImdbOpt::from_args() { + ImdbOpt::Benchmark(BenchmarkSubCommandOpt::DataFusionBenchmark(opt)) => { + opt.run().await + } ImdbOpt::Convert(opt) => opt.run().await, } } diff --git a/benchmarks/src/imdb/convert.rs b/benchmarks/src/imdb/convert.rs index c95f7f8bf564..4e470d711da5 100644 --- a/benchmarks/src/imdb/convert.rs +++ b/benchmarks/src/imdb/convert.rs @@ -51,11 +51,12 @@ impl ConvertOpt { pub async fn run(self) -> Result<()> { let input_path = self.input_path.to_str().unwrap(); let output_path = self.output_path.to_str().unwrap(); + let config = SessionConfig::new().with_batch_size(self.batch_size); + let ctx = SessionContext::new_with_config(config); for table in IMDB_TABLES { let start = Instant::now(); let schema = get_imdb_table_schema(table); - let input_path = format!("{input_path}/{table}.csv"); let output_path = format!("{output_path}/{table}.parquet"); let options = CsvReadOptions::new() @@ -65,9 +66,6 @@ impl ConvertOpt { .escape(b'\\') .file_extension(".csv"); - let config = SessionConfig::new().with_batch_size(self.batch_size); - let ctx = SessionContext::new_with_config(config); - let mut csv = ctx.read_csv(&input_path, options).await?; // Select all apart from the padding column diff --git a/benchmarks/src/imdb/mod.rs b/benchmarks/src/imdb/mod.rs index 8e2977c0384e..6a45242e6ff4 100644 --- a/benchmarks/src/imdb/mod.rs +++ b/benchmarks/src/imdb/mod.rs @@ -17,10 +17,18 @@ //! Benchmark derived from IMDB dataset. -use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion::{ + arrow::datatypes::{DataType, Field, Schema}, + common::plan_err, + error::Result, +}; mod convert; pub use convert::ConvertOpt; +use std::fs; +mod run; +pub use run::RunOpt; + // we have 21 tables in the IMDB dataset pub const IMDB_TABLES: &[&str] = &[ "aka_name", @@ -51,7 +59,7 @@ pub const IMDB_TABLES: &[&str] = &[ pub fn get_imdb_table_schema(table: &str) -> Schema { match table { "aka_name" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("person_id", DataType::Int32, false), Field::new("name", DataType::Utf8, true), Field::new("imdb_index", DataType::Utf8, true), @@ -61,7 +69,7 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "aka_title" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("title", DataType::Utf8, true), Field::new("imdb_index", DataType::Utf8, true), @@ -75,7 +83,7 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "cast_info" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("person_id", DataType::Int32, false), Field::new("movie_id", DataType::Int32, false), Field::new("person_role_id", DataType::Int32, true), @@ -84,7 +92,7 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("role_id", DataType::Int32, false), ]), "char_name" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("name", DataType::Utf8, false), Field::new("imdb_index", DataType::Utf8, true), 
Field::new("imdb_id", DataType::Int32, true), @@ -93,11 +101,11 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "comp_cast_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("kind", DataType::Utf8, false), ]), "company_name" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("name", DataType::Utf8, false), Field::new("country_code", DataType::Utf8, true), Field::new("imdb_id", DataType::Int32, true), @@ -106,59 +114,59 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "company_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("kind", DataType::Utf8, true), ]), "complete_cast" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, true), Field::new("subject_id", DataType::Int32, false), Field::new("status_id", DataType::Int32, false), ]), "info_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("info", DataType::Utf8, false), ]), "keyword" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("keyword", DataType::Utf8, false), Field::new("phonetic_code", DataType::Utf8, true), ]), "kind_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("kind", DataType::Utf8, true), ]), "link_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("link", DataType::Utf8, false), ]), "movie_companies" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("company_id", DataType::Int32, false), Field::new("company_type_id", DataType::Int32, false), Field::new("note", DataType::Utf8, true), ]), "movie_info_idx" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("info_type_id", DataType::Int32, false), Field::new("info", DataType::Utf8, false), Field::new("note", DataType::Utf8, true), ]), "movie_keyword" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("keyword_id", DataType::Int32, false), ]), "movie_link" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("linked_movie_id", DataType::Int32, false), Field::new("link_type_id", DataType::Int32, false), ]), "name" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("name", DataType::Utf8, false), Field::new("imdb_index", DataType::Utf8, true), Field::new("imdb_id", DataType::Int32, true), @@ -169,11 +177,11 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "role_type" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, 
false), Field::new("role", DataType::Utf8, false), ]), "title" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("title", DataType::Utf8, false), Field::new("imdb_index", DataType::Utf8, true), Field::new("kind_id", DataType::Int32, false), @@ -187,14 +195,14 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { Field::new("md5sum", DataType::Utf8, true), ]), "movie_info" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("movie_id", DataType::Int32, false), Field::new("info_type_id", DataType::Int32, false), Field::new("info", DataType::Utf8, false), Field::new("note", DataType::Utf8, true), ]), "person_info" => Schema::new(vec![ - Field::new("id", DataType::Int32, false), + Field::new("id", DataType::UInt32, false), Field::new("person_id", DataType::Int32, false), Field::new("info_type_id", DataType::Int32, false), Field::new("info", DataType::Utf8, false), @@ -203,3 +211,26 @@ pub fn get_imdb_table_schema(table: &str) -> Schema { _ => unimplemented!("Schema for table {} is not implemented", table), } } + +/// Get the SQL statements from the specified query file +pub fn get_query_sql(query: &str) -> Result> { + let possibilities = vec![ + format!("queries/imdb/{query}.sql"), + format!("benchmarks/queries/imdb/{query}.sql"), + ]; + let mut errors = vec![]; + for filename in possibilities { + match fs::read_to_string(&filename) { + Ok(contents) => { + return Ok(contents + .split(';') + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + .map(|s| s.to_string()) + .collect()); + } + Err(e) => errors.push(format!("{filename}: {e}")), + }; + } + plan_err!("invalid query. Could not find query: {:?}", errors) +} diff --git a/benchmarks/src/imdb/run.rs b/benchmarks/src/imdb/run.rs new file mode 100644 index 000000000000..697c79dc894a --- /dev/null +++ b/benchmarks/src/imdb/run.rs @@ -0,0 +1,827 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
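For context, a minimal sketch (not part of the patch) of how the `get_query_sql` helper added to `mod.rs` above might be driven. The helper resolves the query file under `queries/imdb/` or `benchmarks/queries/imdb/` and returns one string per non-empty `;`-separated statement; the caller below is hypothetical and assumes the helper and DataFusion's `Result` are in scope:

use datafusion::error::Result;

// Hypothetical caller: load every statement of IMDB query "1a" and print it.
// `get_query_sql` is the function from the diff above and returns one
// String per non-empty statement in the file.
fn print_query_statements() -> Result<()> {
    for sql in get_query_sql("1a")? {
        println!("{sql}");
    }
    Ok(())
}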
+ +use std::path::PathBuf; +use std::sync::Arc; + +use super::{get_imdb_table_schema, get_query_sql, IMDB_TABLES}; +use crate::{BenchmarkRun, CommonOpt}; + +use arrow::record_batch::RecordBatch; +use arrow::util::pretty::{self, pretty_format_batches}; +use datafusion::datasource::file_format::csv::CsvFormat; +use datafusion::datasource::file_format::parquet::ParquetFormat; +use datafusion::datasource::file_format::FileFormat; +use datafusion::datasource::listing::{ + ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, +}; +use datafusion::datasource::{MemTable, TableProvider}; +use datafusion::error::Result; +use datafusion::physical_plan::display::DisplayableExecutionPlan; +use datafusion::physical_plan::{collect, displayable}; +use datafusion::prelude::*; +use datafusion_common::instant::Instant; +use datafusion_common::{DEFAULT_CSV_EXTENSION, DEFAULT_PARQUET_EXTENSION}; + +use log::info; +use structopt::StructOpt; + +// hack to avoid `default_value is meaningless for bool` errors +type BoolDefaultTrue = bool; + +/// Run the IMDB benchmark (a.k.a. JOB). +/// +/// This benchmark is derived from the [Join Order Benchmark / JOB] proposed in the paper [How Good Are Query Optimizers, Really?][1]. +/// The data and answers are downloaded from +/// [2] and [3]. +/// +/// [1]: https://www.vldb.org/pvldb/vol9/p204-leis.pdf +/// [2]: http://homepages.cwi.nl/~boncz/job/imdb.tgz +/// [3]: https://db.in.tum.de/~leis/qo/job.tgz + +#[derive(Debug, StructOpt, Clone)] +#[structopt(verbatim_doc_comment)] +pub struct RunOpt { + /// Query number. If not specified, runs all queries + #[structopt(short, long)] + query: Option<usize>, + + /// Common options + #[structopt(flatten)] + common: CommonOpt, + + /// Path to data files + #[structopt(parse(from_os_str), required = true, short = "p", long = "path")] + path: PathBuf, + + /// File format: `csv` or `parquet` + #[structopt(short = "f", long = "format", default_value = "csv")] + file_format: String, + + /// Load the data into a MemTable before executing the query + #[structopt(short = "m", long = "mem-table")] + mem_table: bool, + + /// Path to machine-readable output file + #[structopt(parse(from_os_str), short = "o", long = "output")] + output_path: Option<PathBuf>, + + /// Whether to disable collection of statistics (and cost-based optimizations) or not. + #[structopt(short = "S", long = "disable-statistics")] + disable_statistics: bool, + + /// If true then hash join is used; if false then sort merge join. + /// True by default.
+ #[structopt(short = "j", long = "prefer_hash_join", default_value = "true")] + prefer_hash_join: BoolDefaultTrue, +} + +const IMDB_QUERY_START_ID: usize = 1; +const IMDB_QUERY_END_ID: usize = 113; + +fn map_query_id_to_str(query_id: usize) -> &'static str { + match query_id { + // 1 + 1 => "1a", + 2 => "1b", + 3 => "1c", + 4 => "1d", + + // 2 + 5 => "2a", + 6 => "2b", + 7 => "2c", + 8 => "2d", + + // 3 + 9 => "3a", + 10 => "3b", + 11 => "3c", + + // 4 + 12 => "4a", + 13 => "4b", + 14 => "4c", + + // 5 + 15 => "5a", + 16 => "5b", + 17 => "5c", + + // 6 + 18 => "6a", + 19 => "6b", + 20 => "6c", + 21 => "6d", + 22 => "6e", + 23 => "6f", + + // 7 + 24 => "7a", + 25 => "7b", + 26 => "7c", + + // 8 + 27 => "8a", + 28 => "8b", + 29 => "8c", + 30 => "8d", + + // 9 + 31 => "9a", + 32 => "9b", + 33 => "9c", + 34 => "9d", + + // 10 + 35 => "10a", + 36 => "10b", + 37 => "10c", + + // 11 + 38 => "11a", + 39 => "11b", + 40 => "11c", + 41 => "11d", + + // 12 + 42 => "12a", + 43 => "12b", + 44 => "12c", + + // 13 + 45 => "13a", + 46 => "13b", + 47 => "13c", + 48 => "13d", + + // 14 + 49 => "14a", + 50 => "14b", + 51 => "14c", + + // 15 + 52 => "15a", + 53 => "15b", + 54 => "15c", + 55 => "15d", + + // 16 + 56 => "16a", + 57 => "16b", + 58 => "16c", + 59 => "16d", + + // 17 + 60 => "17a", + 61 => "17b", + 62 => "17c", + 63 => "17d", + 64 => "17e", + 65 => "17f", + + // 18 + 66 => "18a", + 67 => "18b", + 68 => "18c", + + // 19 + 69 => "19a", + 70 => "19b", + 71 => "19c", + 72 => "19d", + + // 20 + 73 => "20a", + 74 => "20b", + 75 => "20c", + + // 21 + 76 => "21a", + 77 => "21b", + 78 => "21c", + + // 22 + 79 => "22a", + 80 => "22b", + 81 => "22c", + 82 => "22d", + + // 23 + 83 => "23a", + 84 => "23b", + 85 => "23c", + + // 24 + 86 => "24a", + 87 => "24b", + + // 25 + 88 => "25a", + 89 => "25b", + 90 => "25c", + + // 26 + 91 => "26a", + 92 => "26b", + 93 => "26c", + + // 27 + 94 => "27a", + 95 => "27b", + 96 => "27c", + + // 28 + 97 => "28a", + 98 => "28b", + 99 => "28c", + + // 29 + 100 => "29a", + 101 => "29b", + 102 => "29c", + + // 30 + 103 => "30a", + 104 => "30b", + 105 => "30c", + + // 31 + 106 => "31a", + 107 => "31b", + 108 => "31c", + + // 32 + 109 => "32a", + 110 => "32b", + + // 33 + 111 => "33a", + 112 => "33b", + 113 => "33c", + + // Fallback for unknown query_id + _ => "unknown", + } +} + +impl RunOpt { + pub async fn run(self) -> Result<()> { + println!("Running benchmarks with the following options: {self:?}"); + let query_range = match self.query { + Some(query_id) => query_id..=query_id, + None => IMDB_QUERY_START_ID..=IMDB_QUERY_END_ID, + }; + + let mut benchmark_run = BenchmarkRun::new(); + for query_id in query_range { + benchmark_run.start_new_case(&format!("Query {query_id}")); + let query_run = self.benchmark_query(query_id).await?; + for iter in query_run { + benchmark_run.write_iter(iter.elapsed, iter.row_count); + } + } + benchmark_run.maybe_write_json(self.output_path.as_ref())?; + Ok(()) + } + + async fn benchmark_query(&self, query_id: usize) -> Result> { + let mut config = self + .common + .config() + .with_collect_statistics(!self.disable_statistics); + config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join; + config + .options_mut() + .execution + .parquet + .schema_force_view_types = self.common.force_view_types; + let ctx = SessionContext::new_with_config(config); + + // register tables + self.register_tables(&ctx).await?; + + let mut millis = vec![]; + // run benchmark + let mut query_results = vec![]; + for i in 0..self.iterations() { + let start = 
Instant::now(); + + let query_id_str = map_query_id_to_str(query_id); + let sql = &get_query_sql(query_id_str)?; + + let mut result = vec![]; + + for query in sql { + result = self.execute_query(&ctx, query).await?; + } + + let elapsed = start.elapsed(); + let ms = elapsed.as_secs_f64() * 1000.0; + millis.push(ms); + info!("output:\n\n{}\n\n", pretty_format_batches(&result)?); + let row_count = result.iter().map(|b| b.num_rows()).sum(); + println!( + "Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows" + ); + query_results.push(QueryResult { elapsed, row_count }); + } + + let avg = millis.iter().sum::<f64>() / millis.len() as f64; + println!("Query {query_id} avg time: {avg:.2} ms"); + + Ok(query_results) + } + + async fn register_tables(&self, ctx: &SessionContext) -> Result<()> { + for table in IMDB_TABLES { + let table_provider = { self.get_table(ctx, table).await? }; + + if self.mem_table { + println!("Loading table '{table}' into memory"); + let start = Instant::now(); + let memtable = + MemTable::load(table_provider, Some(self.partitions()), &ctx.state()) + .await?; + println!( + "Loaded table '{}' into memory in {} ms", + table, + start.elapsed().as_millis() + ); + ctx.register_table(*table, Arc::new(memtable))?; + } else { + ctx.register_table(*table, table_provider)?; + } + } + Ok(()) + } + + async fn execute_query( + &self, + ctx: &SessionContext, + sql: &str, + ) -> Result<Vec<RecordBatch>> { + let debug = self.common.debug; + let plan = ctx.sql(sql).await?; + let (state, plan) = plan.into_parts(); + + if debug { + println!("=== Logical plan ===\n{plan}\n"); + } + + let plan = state.optimize(&plan)?; + if debug { + println!("=== Optimized logical plan ===\n{plan}\n"); + } + let physical_plan = state.create_physical_plan(&plan).await?; + if debug { + println!( + "=== Physical plan ===\n{}\n", + displayable(physical_plan.as_ref()).indent(true) + ); + } + let result = collect(physical_plan.clone(), state.task_ctx()).await?; + if debug { + println!( + "=== Physical plan with metrics ===\n{}\n", + DisplayableExecutionPlan::with_metrics(physical_plan.as_ref()) + .indent(true) + ); + if !result.is_empty() { + // do not call print_batches if there are no batches as the result is confusing + // and makes it look like there is a batch with no columns + pretty::print_batches(&result)?; + } + } + Ok(result) + } + + async fn get_table( + &self, + ctx: &SessionContext, + table: &str, + ) -> Result<Arc<dyn TableProvider>> { + let path = self.path.to_str().unwrap(); + let table_format = self.file_format.as_str(); + + // Obtain a snapshot of the SessionState + let state = ctx.state(); + let (format, path, extension): (Arc<dyn FileFormat>, String, &'static str) = + match table_format { + // .tbl files are '|' delimited and have no header + "tbl" => { + let path = format!("{path}/{table}.tbl"); + + let format = CsvFormat::default() + .with_delimiter(b'|') + .with_has_header(false); + + (Arc::new(format), path, ".tbl") + } + "csv" => { + let path = format!("{path}/{table}.csv"); + let format = CsvFormat::default() + .with_delimiter(b',') + .with_escape(Some(b'\\')) + .with_has_header(false); + + (Arc::new(format), path, DEFAULT_CSV_EXTENSION) + } + "parquet" => { + let path = format!("{path}/{table}.parquet"); + let format = ParquetFormat::default() + .with_options(ctx.state().table_options().parquet.clone()); + (Arc::new(format), path, DEFAULT_PARQUET_EXTENSION) + } + other => { + unimplemented!("Invalid file format '{}'", other); + } + }; + + let options = ListingOptions::new(format) +
.with_file_extension(extension) + .with_collect_stat(state.config().collect_statistics()); + + let table_path = ListingTableUrl::parse(path)?; + let config = ListingTableConfig::new(table_path).with_listing_options(options); + let config = match table_format { + "parquet" => config.with_schema(Arc::new(get_imdb_table_schema(table))), + "csv" => config.with_schema(Arc::new(get_imdb_table_schema(table))), + _ => unreachable!(), + }; + + Ok(Arc::new(ListingTable::try_new(config)?)) + } + + fn iterations(&self) -> usize { + self.common.iterations + } + + fn partitions(&self) -> usize { + self.common.partitions.unwrap_or(num_cpus::get()) + } +} + +struct QueryResult { + elapsed: std::time::Duration, + row_count: usize, +} + +#[cfg(test)] +// Only run with "ci" mode when we have the data +#[cfg(feature = "ci")] +mod tests { + use std::path::Path; + + use super::*; + + use datafusion::common::exec_err; + use datafusion::error::Result; + use datafusion_proto::bytes::{ + logical_plan_from_bytes, logical_plan_to_bytes, physical_plan_from_bytes, + physical_plan_to_bytes, + }; + + fn get_imdb_data_path() -> Result { + let path = + std::env::var("IMDB_DATA").unwrap_or_else(|_| "benchmarks/data".to_string()); + if !Path::new(&path).exists() { + return exec_err!( + "Benchmark data not found (set IMDB_DATA env var to override): {}", + path + ); + } + Ok(path) + } + + async fn round_trip_logical_plan(query: usize) -> Result<()> { + let ctx = SessionContext::default(); + let path = get_imdb_data_path()?; + let common = CommonOpt { + iterations: 1, + partitions: Some(2), + batch_size: 8192, + debug: false, + force_view_types: false, + }; + let opt = RunOpt { + query: Some(query), + common, + path: PathBuf::from(path.to_string()), + file_format: "parquet".to_string(), + mem_table: false, + output_path: None, + disable_statistics: false, + prefer_hash_join: true, + }; + opt.register_tables(&ctx).await?; + let queries = get_query_sql(map_query_id_to_str(query))?; + for query in queries { + let plan = ctx.sql(&query).await?; + let plan = plan.into_optimized_plan()?; + let bytes = logical_plan_to_bytes(&plan)?; + let plan2 = logical_plan_from_bytes(&bytes, &ctx)?; + let plan_formatted = format!("{}", plan.display_indent()); + let plan2_formatted = format!("{}", plan2.display_indent()); + assert_eq!(plan_formatted, plan2_formatted); + } + Ok(()) + } + + async fn round_trip_physical_plan(query: usize) -> Result<()> { + let ctx = SessionContext::default(); + let path = get_imdb_data_path()?; + let common = CommonOpt { + iterations: 1, + partitions: Some(2), + batch_size: 8192, + debug: false, + force_view_types: false, + }; + let opt = RunOpt { + query: Some(query), + common, + path: PathBuf::from(path.to_string()), + file_format: "parquet".to_string(), + mem_table: false, + output_path: None, + disable_statistics: false, + prefer_hash_join: true, + }; + opt.register_tables(&ctx).await?; + let queries = get_query_sql(map_query_id_to_str(query))?; + for query in queries { + let plan = ctx.sql(&query).await?; + let plan = plan.create_physical_plan().await?; + let bytes = physical_plan_to_bytes(plan.clone())?; + let plan2 = physical_plan_from_bytes(&bytes, &ctx)?; + let plan_formatted = format!("{}", displayable(plan.as_ref()).indent(false)); + let plan2_formatted = + format!("{}", displayable(plan2.as_ref()).indent(false)); + assert_eq!(plan_formatted, plan2_formatted); + } + Ok(()) + } + + macro_rules! 
test_round_trip_logical { + ($tn:ident, $query:expr) => { + #[tokio::test] + async fn $tn() -> Result<()> { + round_trip_logical_plan($query).await + } + }; + } + + macro_rules! test_round_trip_physical { + ($tn:ident, $query:expr) => { + #[tokio::test] + async fn $tn() -> Result<()> { + round_trip_physical_plan($query).await + } + }; + } + + // logical plan tests + test_round_trip_logical!(round_trip_logical_plan_1a, 1); + test_round_trip_logical!(round_trip_logical_plan_1b, 2); + test_round_trip_logical!(round_trip_logical_plan_1c, 3); + test_round_trip_logical!(round_trip_logical_plan_1d, 4); + test_round_trip_logical!(round_trip_logical_plan_2a, 5); + test_round_trip_logical!(round_trip_logical_plan_2b, 6); + test_round_trip_logical!(round_trip_logical_plan_2c, 7); + test_round_trip_logical!(round_trip_logical_plan_2d, 8); + test_round_trip_logical!(round_trip_logical_plan_3a, 9); + test_round_trip_logical!(round_trip_logical_plan_3b, 10); + test_round_trip_logical!(round_trip_logical_plan_3c, 11); + test_round_trip_logical!(round_trip_logical_plan_4a, 12); + test_round_trip_logical!(round_trip_logical_plan_4b, 13); + test_round_trip_logical!(round_trip_logical_plan_4c, 14); + test_round_trip_logical!(round_trip_logical_plan_5a, 15); + test_round_trip_logical!(round_trip_logical_plan_5b, 16); + test_round_trip_logical!(round_trip_logical_plan_5c, 17); + test_round_trip_logical!(round_trip_logical_plan_6a, 18); + test_round_trip_logical!(round_trip_logical_plan_6b, 19); + test_round_trip_logical!(round_trip_logical_plan_6c, 20); + test_round_trip_logical!(round_trip_logical_plan_6d, 21); + test_round_trip_logical!(round_trip_logical_plan_6e, 22); + test_round_trip_logical!(round_trip_logical_plan_6f, 23); + test_round_trip_logical!(round_trip_logical_plan_7a, 24); + test_round_trip_logical!(round_trip_logical_plan_7b, 25); + test_round_trip_logical!(round_trip_logical_plan_7c, 26); + test_round_trip_logical!(round_trip_logical_plan_8a, 27); + test_round_trip_logical!(round_trip_logical_plan_8b, 28); + test_round_trip_logical!(round_trip_logical_plan_8c, 29); + test_round_trip_logical!(round_trip_logical_plan_8d, 30); + test_round_trip_logical!(round_trip_logical_plan_9a, 31); + test_round_trip_logical!(round_trip_logical_plan_9b, 32); + test_round_trip_logical!(round_trip_logical_plan_9c, 33); + test_round_trip_logical!(round_trip_logical_plan_9d, 34); + test_round_trip_logical!(round_trip_logical_plan_10a, 35); + test_round_trip_logical!(round_trip_logical_plan_10b, 36); + test_round_trip_logical!(round_trip_logical_plan_10c, 37); + test_round_trip_logical!(round_trip_logical_plan_11a, 38); + test_round_trip_logical!(round_trip_logical_plan_11b, 39); + test_round_trip_logical!(round_trip_logical_plan_11c, 40); + test_round_trip_logical!(round_trip_logical_plan_11d, 41); + test_round_trip_logical!(round_trip_logical_plan_12a, 42); + test_round_trip_logical!(round_trip_logical_plan_12b, 43); + test_round_trip_logical!(round_trip_logical_plan_12c, 44); + test_round_trip_logical!(round_trip_logical_plan_13a, 45); + test_round_trip_logical!(round_trip_logical_plan_13b, 46); + test_round_trip_logical!(round_trip_logical_plan_13c, 47); + test_round_trip_logical!(round_trip_logical_plan_13d, 48); + test_round_trip_logical!(round_trip_logical_plan_14a, 49); + test_round_trip_logical!(round_trip_logical_plan_14b, 50); + test_round_trip_logical!(round_trip_logical_plan_14c, 51); + test_round_trip_logical!(round_trip_logical_plan_15a, 52); + test_round_trip_logical!(round_trip_logical_plan_15b, 
53); + test_round_trip_logical!(round_trip_logical_plan_15c, 54); + test_round_trip_logical!(round_trip_logical_plan_15d, 55); + test_round_trip_logical!(round_trip_logical_plan_16a, 56); + test_round_trip_logical!(round_trip_logical_plan_16b, 57); + test_round_trip_logical!(round_trip_logical_plan_16c, 58); + test_round_trip_logical!(round_trip_logical_plan_16d, 59); + test_round_trip_logical!(round_trip_logical_plan_17a, 60); + test_round_trip_logical!(round_trip_logical_plan_17b, 61); + test_round_trip_logical!(round_trip_logical_plan_17c, 62); + test_round_trip_logical!(round_trip_logical_plan_17d, 63); + test_round_trip_logical!(round_trip_logical_plan_17e, 64); + test_round_trip_logical!(round_trip_logical_plan_17f, 65); + test_round_trip_logical!(round_trip_logical_plan_18a, 66); + test_round_trip_logical!(round_trip_logical_plan_18b, 67); + test_round_trip_logical!(round_trip_logical_plan_18c, 68); + test_round_trip_logical!(round_trip_logical_plan_19a, 69); + test_round_trip_logical!(round_trip_logical_plan_19b, 70); + test_round_trip_logical!(round_trip_logical_plan_19c, 71); + test_round_trip_logical!(round_trip_logical_plan_19d, 72); + test_round_trip_logical!(round_trip_logical_plan_20a, 73); + test_round_trip_logical!(round_trip_logical_plan_20b, 74); + test_round_trip_logical!(round_trip_logical_plan_20c, 75); + test_round_trip_logical!(round_trip_logical_plan_21a, 76); + test_round_trip_logical!(round_trip_logical_plan_21b, 77); + test_round_trip_logical!(round_trip_logical_plan_21c, 78); + test_round_trip_logical!(round_trip_logical_plan_22a, 79); + test_round_trip_logical!(round_trip_logical_plan_22b, 80); + test_round_trip_logical!(round_trip_logical_plan_22c, 81); + test_round_trip_logical!(round_trip_logical_plan_22d, 82); + test_round_trip_logical!(round_trip_logical_plan_23a, 83); + test_round_trip_logical!(round_trip_logical_plan_23b, 84); + test_round_trip_logical!(round_trip_logical_plan_23c, 85); + test_round_trip_logical!(round_trip_logical_plan_24a, 86); + test_round_trip_logical!(round_trip_logical_plan_24b, 87); + test_round_trip_logical!(round_trip_logical_plan_25a, 88); + test_round_trip_logical!(round_trip_logical_plan_25b, 89); + test_round_trip_logical!(round_trip_logical_plan_25c, 90); + test_round_trip_logical!(round_trip_logical_plan_26a, 91); + test_round_trip_logical!(round_trip_logical_plan_26b, 92); + test_round_trip_logical!(round_trip_logical_plan_26c, 93); + test_round_trip_logical!(round_trip_logical_plan_27a, 94); + test_round_trip_logical!(round_trip_logical_plan_27b, 95); + test_round_trip_logical!(round_trip_logical_plan_27c, 96); + test_round_trip_logical!(round_trip_logical_plan_28a, 97); + test_round_trip_logical!(round_trip_logical_plan_28b, 98); + test_round_trip_logical!(round_trip_logical_plan_28c, 99); + test_round_trip_logical!(round_trip_logical_plan_29a, 100); + test_round_trip_logical!(round_trip_logical_plan_29b, 101); + test_round_trip_logical!(round_trip_logical_plan_29c, 102); + test_round_trip_logical!(round_trip_logical_plan_30a, 103); + test_round_trip_logical!(round_trip_logical_plan_30b, 104); + test_round_trip_logical!(round_trip_logical_plan_30c, 105); + test_round_trip_logical!(round_trip_logical_plan_31a, 106); + test_round_trip_logical!(round_trip_logical_plan_31b, 107); + test_round_trip_logical!(round_trip_logical_plan_31c, 108); + test_round_trip_logical!(round_trip_logical_plan_32a, 109); + test_round_trip_logical!(round_trip_logical_plan_32b, 110); + test_round_trip_logical!(round_trip_logical_plan_33a, 111); 
+ test_round_trip_logical!(round_trip_logical_plan_33b, 112); + test_round_trip_logical!(round_trip_logical_plan_33c, 113); + + // physical plan tests + test_round_trip_physical!(round_trip_physical_plan_1a, 1); + test_round_trip_physical!(round_trip_physical_plan_1b, 2); + test_round_trip_physical!(round_trip_physical_plan_1c, 3); + test_round_trip_physical!(round_trip_physical_plan_1d, 4); + test_round_trip_physical!(round_trip_physical_plan_2a, 5); + test_round_trip_physical!(round_trip_physical_plan_2b, 6); + test_round_trip_physical!(round_trip_physical_plan_2c, 7); + test_round_trip_physical!(round_trip_physical_plan_2d, 8); + test_round_trip_physical!(round_trip_physical_plan_3a, 9); + test_round_trip_physical!(round_trip_physical_plan_3b, 10); + test_round_trip_physical!(round_trip_physical_plan_3c, 11); + test_round_trip_physical!(round_trip_physical_plan_4a, 12); + test_round_trip_physical!(round_trip_physical_plan_4b, 13); + test_round_trip_physical!(round_trip_physical_plan_4c, 14); + test_round_trip_physical!(round_trip_physical_plan_5a, 15); + test_round_trip_physical!(round_trip_physical_plan_5b, 16); + test_round_trip_physical!(round_trip_physical_plan_5c, 17); + test_round_trip_physical!(round_trip_physical_plan_6a, 18); + test_round_trip_physical!(round_trip_physical_plan_6b, 19); + test_round_trip_physical!(round_trip_physical_plan_6c, 20); + test_round_trip_physical!(round_trip_physical_plan_6d, 21); + test_round_trip_physical!(round_trip_physical_plan_6e, 22); + test_round_trip_physical!(round_trip_physical_plan_6f, 23); + test_round_trip_physical!(round_trip_physical_plan_7a, 24); + test_round_trip_physical!(round_trip_physical_plan_7b, 25); + test_round_trip_physical!(round_trip_physical_plan_7c, 26); + test_round_trip_physical!(round_trip_physical_plan_8a, 27); + test_round_trip_physical!(round_trip_physical_plan_8b, 28); + test_round_trip_physical!(round_trip_physical_plan_8c, 29); + test_round_trip_physical!(round_trip_physical_plan_8d, 30); + test_round_trip_physical!(round_trip_physical_plan_9a, 31); + test_round_trip_physical!(round_trip_physical_plan_9b, 32); + test_round_trip_physical!(round_trip_physical_plan_9c, 33); + test_round_trip_physical!(round_trip_physical_plan_9d, 34); + test_round_trip_physical!(round_trip_physical_plan_10a, 35); + test_round_trip_physical!(round_trip_physical_plan_10b, 36); + test_round_trip_physical!(round_trip_physical_plan_10c, 37); + test_round_trip_physical!(round_trip_physical_plan_11a, 38); + test_round_trip_physical!(round_trip_physical_plan_11b, 39); + test_round_trip_physical!(round_trip_physical_plan_11c, 40); + test_round_trip_physical!(round_trip_physical_plan_11d, 41); + test_round_trip_physical!(round_trip_physical_plan_12a, 42); + test_round_trip_physical!(round_trip_physical_plan_12b, 43); + test_round_trip_physical!(round_trip_physical_plan_12c, 44); + test_round_trip_physical!(round_trip_physical_plan_13a, 45); + test_round_trip_physical!(round_trip_physical_plan_13b, 46); + test_round_trip_physical!(round_trip_physical_plan_13c, 47); + test_round_trip_physical!(round_trip_physical_plan_13d, 48); + test_round_trip_physical!(round_trip_physical_plan_14a, 49); + test_round_trip_physical!(round_trip_physical_plan_14b, 50); + test_round_trip_physical!(round_trip_physical_plan_14c, 51); + test_round_trip_physical!(round_trip_physical_plan_15a, 52); + test_round_trip_physical!(round_trip_physical_plan_15b, 53); + test_round_trip_physical!(round_trip_physical_plan_15c, 54); + 
test_round_trip_physical!(round_trip_physical_plan_15d, 55); + test_round_trip_physical!(round_trip_physical_plan_16a, 56); + test_round_trip_physical!(round_trip_physical_plan_16b, 57); + test_round_trip_physical!(round_trip_physical_plan_16c, 58); + test_round_trip_physical!(round_trip_physical_plan_16d, 59); + test_round_trip_physical!(round_trip_physical_plan_17a, 60); + test_round_trip_physical!(round_trip_physical_plan_17b, 61); + test_round_trip_physical!(round_trip_physical_plan_17c, 62); + test_round_trip_physical!(round_trip_physical_plan_17d, 63); + test_round_trip_physical!(round_trip_physical_plan_17e, 64); + test_round_trip_physical!(round_trip_physical_plan_17f, 65); + test_round_trip_physical!(round_trip_physical_plan_18a, 66); + test_round_trip_physical!(round_trip_physical_plan_18b, 67); + test_round_trip_physical!(round_trip_physical_plan_18c, 68); + test_round_trip_physical!(round_trip_physical_plan_19a, 69); + test_round_trip_physical!(round_trip_physical_plan_19b, 70); + test_round_trip_physical!(round_trip_physical_plan_19c, 71); + test_round_trip_physical!(round_trip_physical_plan_19d, 72); + test_round_trip_physical!(round_trip_physical_plan_20a, 73); + test_round_trip_physical!(round_trip_physical_plan_20b, 74); + test_round_trip_physical!(round_trip_physical_plan_20c, 75); + test_round_trip_physical!(round_trip_physical_plan_21a, 76); + test_round_trip_physical!(round_trip_physical_plan_21b, 77); + test_round_trip_physical!(round_trip_physical_plan_21c, 78); + test_round_trip_physical!(round_trip_physical_plan_22a, 79); + test_round_trip_physical!(round_trip_physical_plan_22b, 80); + test_round_trip_physical!(round_trip_physical_plan_22c, 81); + test_round_trip_physical!(round_trip_physical_plan_22d, 82); + test_round_trip_physical!(round_trip_physical_plan_23a, 83); + test_round_trip_physical!(round_trip_physical_plan_23b, 84); + test_round_trip_physical!(round_trip_physical_plan_23c, 85); + test_round_trip_physical!(round_trip_physical_plan_24a, 86); + test_round_trip_physical!(round_trip_physical_plan_24b, 87); + test_round_trip_physical!(round_trip_physical_plan_25a, 88); + test_round_trip_physical!(round_trip_physical_plan_25b, 89); + test_round_trip_physical!(round_trip_physical_plan_25c, 90); + test_round_trip_physical!(round_trip_physical_plan_26a, 91); + test_round_trip_physical!(round_trip_physical_plan_26b, 92); + test_round_trip_physical!(round_trip_physical_plan_26c, 93); + test_round_trip_physical!(round_trip_physical_plan_27a, 94); + test_round_trip_physical!(round_trip_physical_plan_27b, 95); + test_round_trip_physical!(round_trip_physical_plan_27c, 96); + test_round_trip_physical!(round_trip_physical_plan_28a, 97); + test_round_trip_physical!(round_trip_physical_plan_28b, 98); + test_round_trip_physical!(round_trip_physical_plan_28c, 99); + test_round_trip_physical!(round_trip_physical_plan_29a, 100); + test_round_trip_physical!(round_trip_physical_plan_29b, 101); + test_round_trip_physical!(round_trip_physical_plan_29c, 102); + test_round_trip_physical!(round_trip_physical_plan_30a, 103); + test_round_trip_physical!(round_trip_physical_plan_30b, 104); + test_round_trip_physical!(round_trip_physical_plan_30c, 105); + test_round_trip_physical!(round_trip_physical_plan_31a, 106); + test_round_trip_physical!(round_trip_physical_plan_31b, 107); + test_round_trip_physical!(round_trip_physical_plan_31c, 108); + test_round_trip_physical!(round_trip_physical_plan_32a, 109); + test_round_trip_physical!(round_trip_physical_plan_32b, 110); + 
test_round_trip_physical!(round_trip_physical_plan_33a, 111); + test_round_trip_physical!(round_trip_physical_plan_33b, 112); + test_round_trip_physical!(round_trip_physical_plan_33c, 113); +} From 642a81209e27bf13a150f124012e45fb055ab013 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 4 Oct 2024 02:52:04 -0400 Subject: [PATCH 02/22] Minor: avoid clone while calculating union equivalence properties (#12722) * Minor: avoid clone while calculating union equivalence properties * Update datafusion/physical-expr/src/equivalence/properties.rs * fmt --- .../src/equivalence/properties.rs | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index 8137b4f9da13..51e85e25e077 100644 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -34,7 +34,7 @@ use crate::{ use arrow_schema::{SchemaRef, SortOptions}; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion_common::{plan_err, JoinSide, JoinType, Result}; +use datafusion_common::{internal_err, plan_err, JoinSide, JoinType, Result}; use datafusion_expr::interval_arithmetic::Interval; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_physical_expr_common::utils::ExprPropertiesNode; @@ -1677,14 +1677,22 @@ pub fn calculate_union( ) -> Result<EquivalenceProperties> { // TODO: In some cases, we should be able to preserve some equivalence // classes. Add support for such cases. - let mut init = eqps[0].clone(); + let mut iter = eqps.into_iter(); + let Some(mut acc) = iter.next() else { + return internal_err!( + "Cannot calculate EquivalenceProperties for a union with no inputs" + ); + }; + // Harmonize the schema of the accumulator with the schema of the union: - if !init.schema.eq(&schema) { - init = init.with_new_schema(schema)?; + if !acc.schema.eq(&schema) { + acc = acc.with_new_schema(schema)?; + } + // Fold in the rest of the EquivalenceProperties: + for props in iter { + acc = calculate_union_binary(acc, props)?; } - eqps.into_iter() - .skip(1) - .try_fold(init, calculate_union_binary) + Ok(acc) } #[cfg(test)] From 31cbc439cf8d5d6a85ee4f54e8fba8263a5f5370 Mon Sep 17 00:00:00 2001 From: mertak-synnada Date: Fri, 4 Oct 2024 14:00:37 +0300 Subject: [PATCH 03/22] Simplify streaming_merge function parameters (#12719) * simplify streaming_merge function parameters * revert test change * change StreamingMergeConfig into builder pattern --- .../sort_preserving_repartition_fuzz.rs | 19 ++- .../physical-plan/src/aggregates/row_hash.rs | 19 ++- .../physical-plan/src/repartition/mod.rs | 20 +-- datafusion/physical-plan/src/sorts/mod.rs | 1 - datafusion/physical-plan/src/sorts/sort.rs | 38 ++--- .../src/sorts/sort_preserving_merge.rs | 39 +++-- .../src/sorts/streaming_merge.rs | 151 +++++++++++++----- 7 files changed, 177 insertions(+), 110 deletions(-) diff --git a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs index 408cadc35f48..0cd702372f7c 100644 --- a/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs @@ -29,7 +29,7 @@ mod sp_repartition_fuzz_tests { metrics::{BaselineMetrics, ExecutionPlanMetricsSet}, repartition::RepartitionExec, sorts::sort_preserving_merge::SortPreservingMergeExec, -
sorts::streaming_merge::streaming_merge, + sorts::streaming_merge::StreamingMergeBuilder, stream::RecordBatchStreamAdapter, ExecutionPlan, Partitioning, }; @@ -246,15 +246,14 @@ mod sp_repartition_fuzz_tests { MemoryConsumer::new("test".to_string()).register(context.memory_pool()); // Internally SortPreservingMergeExec uses this function for merging. - let res = streaming_merge( - streams, - schema, - &exprs, - BaselineMetrics::new(&ExecutionPlanMetricsSet::new(), 0), - 1, - None, - mem_reservation, - )?; + let res = StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(schema) + .with_expressions(&exprs) + .with_metrics(BaselineMetrics::new(&ExecutionPlanMetricsSet::new(), 0)) + .with_batch_size(1) + .with_reservation(mem_reservation) + .build()?; let res = collect(res).await?; // Contains the merged result. let res = concat_batches(&res[0].schema(), &res)?; diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 998f6184f321..9e4968f1123e 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ b/datafusion/physical-plan/src/aggregates/row_hash.rs @@ -29,7 +29,7 @@ use crate::aggregates::{ }; use crate::metrics::{BaselineMetrics, MetricBuilder, RecordOutput}; use crate::sorts::sort::sort_batch; -use crate::sorts::streaming_merge; +use crate::sorts::streaming_merge::StreamingMergeBuilder; use crate::spill::{read_spill_as_stream, spill_record_batch_by_size}; use crate::stream::RecordBatchStreamAdapter; use crate::{aggregates, metrics, ExecutionPlan, PhysicalExpr}; @@ -1001,15 +1001,14 @@ impl GroupedHashAggregateStream { streams.push(stream); } self.spill_state.is_stream_merging = true; - self.input = streaming_merge( - streams, - schema, - &self.spill_state.spill_expr, - self.baseline_metrics.clone(), - self.batch_size, - None, - self.reservation.new_empty(), - )?; + self.input = StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(schema) + .with_expressions(&self.spill_state.spill_expr) + .with_metrics(self.baseline_metrics.clone()) + .with_batch_size(self.batch_size) + .with_reservation(self.reservation.new_empty()) + .build()?; self.input_done = false; self.group_ordering = GroupOrdering::Full(GroupOrderingFull::new()); Ok(()) diff --git a/datafusion/physical-plan/src/repartition/mod.rs b/datafusion/physical-plan/src/repartition/mod.rs index 4fd364cca4d0..f0f198319ee3 100644 --- a/datafusion/physical-plan/src/repartition/mod.rs +++ b/datafusion/physical-plan/src/repartition/mod.rs @@ -34,7 +34,7 @@ use crate::metrics::BaselineMetrics; use crate::repartition::distributor_channels::{ channels, partition_aware_channels, DistributionReceiver, DistributionSender, }; -use crate::sorts::streaming_merge; +use crate::sorts::streaming_merge::StreamingMergeBuilder; use crate::stream::RecordBatchStreamAdapter; use crate::{DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics}; @@ -637,15 +637,15 @@ impl ExecutionPlan for RepartitionExec { let merge_reservation = MemoryConsumer::new(format!("{}[Merge {partition}]", name)) .register(context.memory_pool()); - streaming_merge( - input_streams, - schema_captured, - &sort_exprs, - BaselineMetrics::new(&metrics, partition), - context.session_config().batch_size(), - fetch, - merge_reservation, - ) + StreamingMergeBuilder::new() + .with_streams(input_streams) + .with_schema(schema_captured) + .with_expressions(&sort_exprs) + .with_metrics(BaselineMetrics::new(&metrics, partition)) + 
.with_batch_size(context.session_config().batch_size()) + .with_fetch(fetch) + .with_reservation(merge_reservation) + .build() } else { Ok(Box::pin(RepartitionStream { num_input_partitions, diff --git a/datafusion/physical-plan/src/sorts/mod.rs b/datafusion/physical-plan/src/sorts/mod.rs index 7c084761fdc3..ab5df37ed327 100644 --- a/datafusion/physical-plan/src/sorts/mod.rs +++ b/datafusion/physical-plan/src/sorts/mod.rs @@ -28,4 +28,3 @@ mod stream; pub mod streaming_merge; pub use index::RowIndex; -pub(crate) use streaming_merge::streaming_merge; diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 91816713c6c3..50f6f4a93097 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -30,7 +30,7 @@ use crate::limit::LimitStream; use crate::metrics::{ BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet, }; -use crate::sorts::streaming_merge::streaming_merge; +use crate::sorts::streaming_merge::StreamingMergeBuilder; use crate::spill::{read_spill_as_stream, spill_record_batches}; use crate::stream::RecordBatchStreamAdapter; use crate::topk::TopK; @@ -342,15 +342,15 @@ impl ExternalSorter { streams.push(stream); } - streaming_merge( - streams, - Arc::clone(&self.schema), - &self.expr, - self.metrics.baseline.clone(), - self.batch_size, - self.fetch, - self.reservation.new_empty(), - ) + StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(Arc::clone(&self.schema)) + .with_expressions(&self.expr) + .with_metrics(self.metrics.baseline.clone()) + .with_batch_size(self.batch_size) + .with_fetch(self.fetch) + .with_reservation(self.reservation.new_empty()) + .build() } else { self.in_mem_sort_stream(self.metrics.baseline.clone()) } @@ -534,15 +534,15 @@ impl ExternalSorter { }) .collect::<Result<Vec<_>>>()?; - streaming_merge( - streams, - Arc::clone(&self.schema), - &self.expr, - metrics, - self.batch_size, - self.fetch, - self.merge_reservation.new_empty(), - ) + StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(Arc::clone(&self.schema)) + .with_expressions(&self.expr) + .with_metrics(metrics) + .with_batch_size(self.batch_size) + .with_fetch(self.fetch) + .with_reservation(self.merge_reservation.new_empty()) + .build() } /// Sorts a single `RecordBatch` into a single stream.
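The call-site conversions above all share the same shape. As a standalone illustration of the design choice (a simplified, hypothetical builder, not the one added by this patch), optional fields accumulate through `with_*` setters and mandatory ones are validated once in `build()`, so callers name every parameter instead of passing seven positional arguments:

// Minimal sketch of the builder pattern used by StreamingMergeBuilder.
// `MergeParamsBuilder` and its fields are illustrative names only.
#[derive(Default)]
struct MergeParamsBuilder {
    batch_size: Option<usize>, // mandatory: checked in build()
    fetch: Option<usize>,      // genuinely optional: passed through as-is
}

impl MergeParamsBuilder {
    fn new() -> Self {
        Self::default()
    }
    fn with_batch_size(mut self, batch_size: usize) -> Self {
        self.batch_size = Some(batch_size);
        self
    }
    fn with_fetch(mut self, fetch: Option<usize>) -> Self {
        self.fetch = fetch;
        self
    }
    // Validation happens once here rather than at every call site.
    fn build(self) -> Result<(usize, Option<usize>), String> {
        let batch_size = self
            .batch_size
            .ok_or_else(|| "batch size must be set".to_string())?;
        Ok((batch_size, self.fetch))
    }
}

fn main() {
    let (batch_size, fetch) = MergeParamsBuilder::new()
        .with_batch_size(8192)
        .with_fetch(None)
        .build()
        .expect("mandatory fields were set");
    assert_eq!(batch_size, 8192);
    assert_eq!(fetch, None);
}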
diff --git a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs index b00a11a5355f..3d3f9dcb98ee 100644 --- a/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs +++ b/datafusion/physical-plan/src/sorts/sort_preserving_merge.rs @@ -24,7 +24,7 @@ use crate::common::spawn_buffered; use crate::expressions::PhysicalSortExpr; use crate::limit::LimitStream; use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; -use crate::sorts::streaming_merge; +use crate::sorts::streaming_merge::StreamingMergeBuilder; use crate::{ DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, Partitioning, PlanProperties, SendableRecordBatchStream, Statistics, @@ -273,15 +273,15 @@ impl ExecutionPlan for SortPreservingMergeExec { debug!("Done setting up sender-receiver for SortPreservingMergeExec::execute"); - let result = streaming_merge( - receivers, - schema, - &self.expr, - BaselineMetrics::new(&self.metrics, partition), - context.session_config().batch_size(), - self.fetch, - reservation, - )?; + let result = StreamingMergeBuilder::new() + .with_streams(receivers) + .with_schema(schema) + .with_expressions(&self.expr) + .with_metrics(BaselineMetrics::new(&self.metrics, partition)) + .with_batch_size(context.session_config().batch_size()) + .with_fetch(self.fetch) + .with_reservation(reservation) + .build()?; debug!("Got stream result from SortPreservingMergeStream::new_from_receivers"); @@ -960,16 +960,15 @@ mod tests { MemoryConsumer::new("test").register(&task_ctx.runtime_env().memory_pool); let fetch = None; - let merge_stream = streaming_merge( - streams, - batches.schema(), - sort.as_slice(), - BaselineMetrics::new(&metrics, 0), - task_ctx.session_config().batch_size(), - fetch, - reservation, - ) - .unwrap(); + let merge_stream = StreamingMergeBuilder::new() + .with_streams(streams) + .with_schema(batches.schema()) + .with_expressions(sort.as_slice()) + .with_metrics(BaselineMetrics::new(&metrics, 0)) + .with_batch_size(task_ctx.session_config().batch_size()) + .with_fetch(fetch) + .with_reservation(reservation) + .build()?; let mut merged = common::collect(merge_stream).await.unwrap(); diff --git a/datafusion/physical-plan/src/sorts/streaming_merge.rs b/datafusion/physical-plan/src/sorts/streaming_merge.rs index 9e6618dd1af5..ad640d8e8470 100644 --- a/datafusion/physical-plan/src/sorts/streaming_merge.rs +++ b/datafusion/physical-plan/src/sorts/streaming_merge.rs @@ -49,49 +49,120 @@ macro_rules! merge_helper { }}; } -/// Perform a streaming merge of [`SendableRecordBatchStream`] based on provided sort expressions -/// while preserving order. 
-pub fn streaming_merge( +#[derive(Default)] +pub struct StreamingMergeBuilder<'a> { streams: Vec, - schema: SchemaRef, - expressions: &[PhysicalSortExpr], - metrics: BaselineMetrics, - batch_size: usize, + schema: Option, + expressions: &'a [PhysicalSortExpr], + metrics: Option, + batch_size: Option, fetch: Option, - reservation: MemoryReservation, -) -> Result { - // If there are no sort expressions, preserving the order - // doesn't mean anything (and result in infinite loops) - if expressions.is_empty() { - return internal_err!("Sort expressions cannot be empty for streaming merge"); + reservation: Option, +} + +impl<'a> StreamingMergeBuilder<'a> { + pub fn new() -> Self { + Self::default() } - // Special case single column comparisons with optimized cursor implementations - if expressions.len() == 1 { - let sort = expressions[0].clone(); - let data_type = sort.expr.data_type(schema.as_ref())?; - downcast_primitive! { - data_type => (primitive_merge_helper, sort, streams, schema, metrics, batch_size, fetch, reservation), - DataType::Utf8 => merge_helper!(StringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) - DataType::LargeUtf8 => merge_helper!(LargeStringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) - DataType::Binary => merge_helper!(BinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) - DataType::LargeBinary => merge_helper!(LargeBinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) - _ => {} - } + + pub fn with_streams(mut self, streams: Vec) -> Self { + self.streams = streams; + self } - let streams = RowCursorStream::try_new( - schema.as_ref(), - expressions, - streams, - reservation.new_empty(), - )?; - - Ok(Box::pin(SortPreservingMergeStream::new( - Box::new(streams), - schema, - metrics, - batch_size, - fetch, - reservation, - ))) + pub fn with_schema(mut self, schema: SchemaRef) -> Self { + self.schema = Some(schema); + self + } + + pub fn with_expressions(mut self, expressions: &'a [PhysicalSortExpr]) -> Self { + self.expressions = expressions; + self + } + + pub fn with_metrics(mut self, metrics: BaselineMetrics) -> Self { + self.metrics = Some(metrics); + self + } + + pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = Some(batch_size); + self + } + + pub fn with_fetch(mut self, fetch: Option) -> Self { + self.fetch = fetch; + self + } + + pub fn with_reservation(mut self, reservation: MemoryReservation) -> Self { + self.reservation = Some(reservation); + self + } + + pub fn build(self) -> Result { + let Self { + streams, + schema, + metrics, + batch_size, + reservation, + fetch, + expressions, + } = self; + + // Early return if streams or expressions are empty + let checks = [ + ( + streams.is_empty(), + "Streams cannot be empty for streaming merge", + ), + ( + expressions.is_empty(), + "Sort expressions cannot be empty for streaming merge", + ), + ]; + + if let Some((_, error_message)) = checks.iter().find(|(condition, _)| *condition) + { + return internal_err!("{}", error_message); + } + + // Unwrapping mandatory fields + let schema = schema.expect("Schema cannot be empty for streaming merge"); + let metrics = metrics.expect("Metrics cannot be empty for streaming merge"); + let batch_size = + batch_size.expect("Batch size cannot be empty for streaming merge"); + let reservation = + reservation.expect("Reservation cannot be empty for streaming merge"); + + // Special case single column comparisons with optimized cursor implementations + if 
expressions.len() == 1 { + let sort = expressions[0].clone(); + let data_type = sort.expr.data_type(schema.as_ref())?; + downcast_primitive! { + data_type => (primitive_merge_helper, sort, streams, schema, metrics, batch_size, fetch, reservation), + DataType::Utf8 => merge_helper!(StringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) + DataType::LargeUtf8 => merge_helper!(LargeStringArray, sort, streams, schema, metrics, batch_size, fetch, reservation) + DataType::Binary => merge_helper!(BinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) + DataType::LargeBinary => merge_helper!(LargeBinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation) + _ => {} + } + } + + let streams = RowCursorStream::try_new( + schema.as_ref(), + expressions, + streams, + reservation.new_empty(), + )?; + Ok(Box::pin(SortPreservingMergeStream::new( + Box::new(streams), + schema, + metrics, + batch_size, + fetch, + reservation, + ))) + } } From d4bc1c1356fff6230a96c5ff6043633c132754a7 Mon Sep 17 00:00:00 2001 From: Bryce Mecum Date: Fri, 4 Oct 2024 04:13:56 -0700 Subject: [PATCH 04/22] Fix links on docs index page (#12750) --- docs/source/index.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 959b964026be..f11670d259bf 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -36,15 +36,15 @@ Apache DataFusion DataFusion is an extensible query engine written in `Rust `_ that uses `Apache Arrow `_ as its in-memory format. -This documentation is for the core DataFusion project, which contains +This documentation is for the `core DataFusion project `_, which contains libraries that are used to build data-centric system software. DataFusion also offers the following subprojects, which provide packaged versions of DataFusion intended for end users, and these have separate documentation. -- DataFusion Python offers a Python interface for SQL and DataFrame +- `DataFusion Python `_ offers a Python interface for SQL and DataFrame queries. -- DataFusion Ray provides a distributed version of DataFusion - that scales out on Ray clusters. -- DataFusion Comet is an accelerator for Apache Spark based on +- `DataFusion Ray `_ provides a distributed version of DataFusion + that scales out on `Ray `_ clusters. +- `DataFusion Comet `_ is an accelerator for Apache Spark based on DataFusion. DataFusion's target users are From 48bff750e690ee17136fa42467709eabfab8fc1e Mon Sep 17 00:00:00 2001 From: wiedld Date: Fri, 4 Oct 2024 08:28:12 -0700 Subject: [PATCH 05/22] Provide field and schema metadata missing on cross joins, and union with null fields. 
(#12729) * test: reproducer for missing schema metadata on cross join * fix: pass thru schema metadata on cross join * fix: preserve metadata when transforming to view types * test: reproducer for missing field metadata in left hand NULL field of union * fix: preserve field metadata from right side of union * chore: safe indexing --- .../core/src/datasource/file_format/mod.rs | 18 +++++------ .../physical-plan/src/joins/cross_join.rs | 13 ++++++-- datafusion/physical-plan/src/union.rs | 11 ++++++- datafusion/sqllogictest/src/test_context.rs | 8 ++++- .../sqllogictest/test_files/metadata.slt | 31 ++++++++++++++++++- 5 files changed, 65 insertions(+), 16 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index 60f2b2dcefa9..e16986c660ad 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -241,16 +241,14 @@ pub fn transform_schema_to_view(schema: &Schema) -> Schema { .fields .iter() .map(|field| match field.data_type() { - DataType::Utf8 | DataType::LargeUtf8 => Arc::new(Field::new( - field.name(), - DataType::Utf8View, - field.is_nullable(), - )), - DataType::Binary | DataType::LargeBinary => Arc::new(Field::new( - field.name(), - DataType::BinaryView, - field.is_nullable(), - )), + DataType::Utf8 | DataType::LargeUtf8 => Arc::new( + Field::new(field.name(), DataType::Utf8View, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), + DataType::Binary | DataType::LargeBinary => Arc::new( + Field::new(field.name(), DataType::BinaryView, field.is_nullable()) + .with_metadata(field.metadata().to_owned()), + ), _ => field.clone(), }) .collect(); diff --git a/datafusion/physical-plan/src/joins/cross_join.rs b/datafusion/physical-plan/src/joins/cross_join.rs index 11153556f253..a70645f3d6c0 100644 --- a/datafusion/physical-plan/src/joins/cross_join.rs +++ b/datafusion/physical-plan/src/joins/cross_join.rs @@ -69,15 +69,22 @@ impl CrossJoinExec { /// Create a new [CrossJoinExec]. 
pub fn new(left: Arc, right: Arc) -> Self { // left then right - let all_columns: Fields = { + let (all_columns, metadata) = { let left_schema = left.schema(); let right_schema = right.schema(); let left_fields = left_schema.fields().iter(); let right_fields = right_schema.fields().iter(); - left_fields.chain(right_fields).cloned().collect() + + let mut metadata = left_schema.metadata().clone(); + metadata.extend(right_schema.metadata().clone()); + + ( + left_fields.chain(right_fields).cloned().collect::(), + metadata, + ) }; - let schema = Arc::new(Schema::new(all_columns)); + let schema = Arc::new(Schema::new(all_columns).with_metadata(metadata)); let cache = Self::compute_properties(&left, &right, Arc::clone(&schema)); CrossJoinExec { left, diff --git a/datafusion/physical-plan/src/union.rs b/datafusion/physical-plan/src/union.rs index 78b25686054d..1cf22060b62a 100644 --- a/datafusion/physical-plan/src/union.rs +++ b/datafusion/physical-plan/src/union.rs @@ -474,7 +474,16 @@ fn union_schema(inputs: &[Arc]) -> SchemaRef { .iter() .filter_map(|input| { if input.schema().fields().len() > i { - Some(input.schema().field(i).clone()) + let field = input.schema().field(i).clone(); + let right_hand_metdata = inputs + .get(1) + .map(|right_input| { + right_input.schema().field(i).metadata().clone() + }) + .unwrap_or_default(); + let mut metadata = field.metadata().clone(); + metadata.extend(right_hand_metdata); + Some(field.with_metadata(metadata)) } else { None } diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index d3ee720467b6..9a0db1c41c71 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -314,8 +314,13 @@ pub async fn register_metadata_tables(ctx: &SessionContext) { String::from("metadata_key"), String::from("the name field"), )])); + let l_name = + Field::new("l_name", DataType::Utf8, true).with_metadata(HashMap::from([( + String::from("metadata_key"), + String::from("the l_name field"), + )])); - let schema = Schema::new(vec![id, name]).with_metadata(HashMap::from([( + let schema = Schema::new(vec![id, name, l_name]).with_metadata(HashMap::from([( String::from("metadata_key"), String::from("the entire schema"), )])); @@ -325,6 +330,7 @@ pub async fn register_metadata_tables(ctx: &SessionContext) { vec![ Arc::new(Int32Array::from(vec![Some(1), None, Some(3)])) as _, Arc::new(StringArray::from(vec![None, Some("bar"), Some("baz")])) as _, + Arc::new(StringArray::from(vec![None, Some("l_bar"), Some("l_baz")])) as _, ], ) .unwrap(); diff --git a/datafusion/sqllogictest/test_files/metadata.slt b/datafusion/sqllogictest/test_files/metadata.slt index f38281abc5ab..d0853b9e4983 100644 --- a/datafusion/sqllogictest/test_files/metadata.slt +++ b/datafusion/sqllogictest/test_files/metadata.slt @@ -25,7 +25,7 @@ ## with metadata in SQL. 
query IT -select * from table_with_metadata; +select id, name from table_with_metadata; ---- 1 NULL NULL bar @@ -96,5 +96,34 @@ select count(id) cnt from table_with_metadata group by name order by cnt; 1 + +# Regression test: missing schema metadata, when aggregate on cross join +query I +SELECT count("data"."id") +FROM + ( + SELECT "id" FROM "table_with_metadata" + ) as "data", + ( + SELECT "id" FROM "table_with_metadata" + ) as "samples"; +---- +6 + +# Regression test: missing field metadata, from the NULL field on the left side of the union +query ITT +(SELECT id, NULL::string as name, l_name FROM "table_with_metadata") + UNION +(SELECT id, name, NULL::string as l_name FROM "table_with_metadata") +ORDER BY id, name, l_name; +---- +1 NULL NULL +3 baz NULL +3 NULL l_baz +NULL bar NULL +NULL NULL l_bar + + + statement ok drop table table_with_metadata; From cf76aba36d56f775a61bb434da5e8e3b07d9061d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 4 Oct 2024 17:38:36 -0400 Subject: [PATCH 06/22] Minor: Update string tests for strpos (#12739) --- .../test_files/string/large_string.slt | 17 ----------- .../sqllogictest/test_files/string/string.slt | 17 ----------- .../test_files/string/string_query.slt.part | 30 +++++++++---------- .../test_files/string/string_view.slt | 17 ----------- 4 files changed, 14 insertions(+), 67 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string/large_string.slt b/datafusion/sqllogictest/test_files/string/large_string.slt index 169c658e5ac1..af6d104e57ac 100644 --- a/datafusion/sqllogictest/test_files/string/large_string.slt +++ b/datafusion/sqllogictest/test_files/string/large_string.slt @@ -72,23 +72,6 @@ false false false true NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12670 -query IIIIII -SELECT - STRPOS(ascii_1, 'e'), - STRPOS(ascii_1, 'ang'), - STRPOS(ascii_1, NULL), - STRPOS(unicode_1, 'и'), - STRPOS(unicode_1, 'ион'), - STRPOS(unicode_1, NULL) -FROM test_basic_operator; ----- -5 0 NULL 0 0 NULL -7 3 NULL 0 0 NULL -6 0 NULL 18 18 NULL -NULL NULL NULL NULL NULL NULL - # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index f4e83966f78f..f003e01ecda0 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -63,23 +63,6 @@ Xiangpeng datafusion数据融合 false true false true Raphael datafusionДатаФусион false false false false NULL NULL NULL NULL NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12670 -query IIIIII -SELECT - STRPOS(ascii_1, 'e'), - STRPOS(ascii_1, 'ang'), - STRPOS(ascii_1, NULL), - STRPOS(unicode_1, 'и'), - STRPOS(unicode_1, 'ион'), - STRPOS(unicode_1, NULL) -FROM test_basic_operator; ----- -5 0 NULL 0 0 NULL -7 3 NULL 0 0 NULL -6 0 NULL 18 18 NULL -NULL NULL NULL NULL NULL NULL - # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 3ba2b31bbab2..6a02296f5e6c 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -951,22 +951,20 @@ NULL NULL # Test STRPOS # -------------------------------------- -# TODO: 
DictionaryString does not support STRPOS. Enable this after fixing the issue -# see issue: https://github.com/apache/datafusion/issues/12670 -#query IIIIII -#SELECT -# STRPOS(ascii_1, 'e'), -# STRPOS(ascii_1, 'ang'), -# STRPOS(ascii_1, NULL), -# STRPOS(unicode_1, 'и'), -# STRPOS(unicode_1, 'ион'), -# STRPOS(unicode_1, NULL) -#FROM test_basic_operator; -#---- -#5 0 NULL 0 0 NULL -#7 3 NULL 0 0 NULL -#6 0 NULL 18 18 NULL -#NULL NULL NULL NULL NULL NULL +query IIIIII +SELECT + STRPOS(ascii_1, 'e'), + STRPOS(ascii_1, 'ang'), + STRPOS(ascii_1, NULL), + STRPOS(unicode_1, 'и'), + STRPOS(unicode_1, 'ион'), + STRPOS(unicode_1, NULL) +FROM test_basic_operator; +---- +5 0 NULL 0 0 NULL +7 3 NULL 0 0 NULL +6 0 NULL 18 18 NULL +NULL NULL NULL NULL NULL NULL # -------------------------------------- # Test SUBSTR_INDEX diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 4e7857ad804b..e7b55c9c1c8c 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -50,23 +50,6 @@ false false false true NULL NULL -# TODO: move it back to `string_query.slt.part` after fixing the issue -# see detail: https://github.com/apache/datafusion/issues/12670 -query IIIIII -SELECT - STRPOS(ascii_1, 'e'), - STRPOS(ascii_1, 'ang'), - STRPOS(ascii_1, NULL), - STRPOS(unicode_1, 'и'), - STRPOS(unicode_1, 'ион'), - STRPOS(unicode_1, NULL) -FROM test_basic_operator; ----- -5 0 NULL 0 0 NULL -7 3 NULL 0 0 NULL -6 0 NULL 18 18 NULL -NULL NULL NULL NULL NULL NULL - # # common test for string-like functions and operators # From 8aafa5498024e45b58ad2068a80cf5942babe55b Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Sat, 5 Oct 2024 10:00:53 +0800 Subject: [PATCH 07/22] Apply `type_union_resolution` to array and values (#12753) * cleanup make array coercion rule Signed-off-by: jayzhan211 * change to type union resolution Signed-off-by: jayzhan211 * change value too Signed-off-by: jayzhan211 * fix tpyo Signed-off-by: jayzhan211 --------- Signed-off-by: jayzhan211 --- .../expr-common/src/type_coercion/binary.rs | 22 +++----- datafusion/expr/src/logical_plan/builder.rs | 5 +- .../expr/src/type_coercion/functions.rs | 23 +++++--- datafusion/functions-nested/src/make_array.rs | 54 ++++++++----------- .../optimizer/src/analyzer/type_coercion.rs | 25 --------- datafusion/sqllogictest/test_files/array.slt | 2 +- datafusion/sqllogictest/test_files/errors.slt | 2 +- datafusion/sqllogictest/test_files/map.slt | 32 ++++++----- datafusion/sqllogictest/test_files/select.slt | 10 +++- 9 files changed, 77 insertions(+), 98 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index e66a9ae1ea98..e7c4f65a1b4e 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -471,10 +471,16 @@ fn type_union_resolution_coercion( let new_value_type = type_union_resolution_coercion(value_type, other_type); new_value_type.map(|t| DataType::Dictionary(index_type.clone(), Box::new(t))) } + (DataType::List(lhs), DataType::List(rhs)) => { + let new_item_type = + type_union_resolution_coercion(lhs.data_type(), rhs.data_type()); + new_item_type.map(|t| DataType::List(Arc::new(Field::new("item", t, true)))) + } _ => { // numeric coercion is the same as comparison coercion, both find the narrowest type // that can accommodate both types binary_numeric_coercion(lhs_type, 
rhs_type) + .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) .or_else(|| numeric_string_coercion(lhs_type, rhs_type)) } @@ -507,22 +513,6 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - if lhs_type == rhs_type { - // same type => equality is possible - return Some(lhs_type.clone()); - } - binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) - .or_else(|| string_coercion(lhs_type, rhs_type)) - .or_else(|| binary_coercion(lhs_type, rhs_type)) -} - /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation /// where one is numeric and one is `Utf8`/`LargeUtf8`. fn string_numeric_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index cc8ddf8ec8e8..da2a96327ce5 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -35,7 +35,6 @@ use crate::logical_plan::{ Projection, Repartition, Sort, SubqueryAlias, TableScan, Union, Unnest, Values, Window, }; -use crate::type_coercion::binary::values_coercion; use crate::utils::{ can_hash, columnize_expr, compare_sort_expr, expr_to_columns, find_valid_equijoin_key_pair, group_window_expr_by_sort_keys, @@ -53,6 +52,7 @@ use datafusion_common::{ plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions, }; +use datafusion_expr_common::type_coercion::binary::type_union_resolution; use super::dml::InsertOp; use super::plan::{ColumnUnnestList, ColumnUnnestType}; @@ -209,7 +209,8 @@ impl LogicalPlanBuilder { } if let Some(prev_type) = common_type { // get common type of each column values. - let Some(new_type) = values_coercion(&data_type, &prev_type) else { + let data_types = vec![prev_type.clone(), data_type.clone()]; + let Some(new_type) = type_union_resolution(&data_types) else { return plan_err!("Inconsistent data type across values list at row {i} column {j}. Was {prev_type} but found {data_type}"); }; common_type = Some(new_type); diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 8ac6ad372482..9000ac2538e6 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -167,6 +167,20 @@ pub fn data_types( try_coerce_types(valid_types, current_types, &signature.type_signature) } +fn is_well_supported_signature(type_signature: &TypeSignature) -> bool { + if let TypeSignature::OneOf(signatures) = type_signature { + return signatures.iter().all(is_well_supported_signature); + } + + matches!( + type_signature, + TypeSignature::UserDefined + | TypeSignature::Numeric(_) + | TypeSignature::Coercible(_) + | TypeSignature::Any(_) + ) +} + fn try_coerce_types( valid_types: Vec>, current_types: &[DataType], @@ -175,14 +189,7 @@ fn try_coerce_types( let mut valid_types = valid_types; // Well-supported signature that returns exact valid types. 
- if !valid_types.is_empty() - && matches!( - type_signature, - TypeSignature::UserDefined - | TypeSignature::Numeric(_) - | TypeSignature::Coercible(_) - ) - { + if !valid_types.is_empty() && is_well_supported_signature(type_signature) { // exact valid types assert_eq!(valid_types.len(), 1); let valid_types = valid_types.swap_remove(0); diff --git a/datafusion/functions-nested/src/make_array.rs b/datafusion/functions-nested/src/make_array.rs index 79858041d3ca..51fc71e6b09d 100644 --- a/datafusion/functions-nested/src/make_array.rs +++ b/datafusion/functions-nested/src/make_array.rs @@ -17,6 +17,7 @@ //! [`ScalarUDFImpl`] definitions for `make_array` function. +use std::vec; use std::{any::Any, sync::Arc}; use arrow::array::{ArrayData, Capacities, MutableArrayData}; @@ -26,9 +27,8 @@ use arrow_array::{ use arrow_buffer::OffsetBuffer; use arrow_schema::DataType::{LargeList, List, Null}; use arrow_schema::{DataType, Field}; -use datafusion_common::internal_err; use datafusion_common::{plan_err, utils::array_into_list_array_nullable, Result}; -use datafusion_expr::type_coercion::binary::comparison_coercion; +use datafusion_expr::binary::type_union_resolution; use datafusion_expr::TypeSignature; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; @@ -82,19 +82,12 @@ impl ScalarUDFImpl for MakeArray { match arg_types.len() { 0 => Ok(empty_array_type()), _ => { - let mut expr_type = DataType::Null; - for arg_type in arg_types { - if !arg_type.equals_datatype(&DataType::Null) { - expr_type = arg_type.clone(); - break; - } - } - - if expr_type.is_null() { - expr_type = DataType::Int64; - } - - Ok(List(Arc::new(Field::new("item", expr_type, true)))) + // At this point, all the type in array should be coerced to the same one + Ok(List(Arc::new(Field::new( + "item", + arg_types[0].to_owned(), + true, + )))) } } } @@ -112,22 +105,21 @@ impl ScalarUDFImpl for MakeArray { } fn coerce_types(&self, arg_types: &[DataType]) -> Result> { - let new_type = arg_types.iter().skip(1).try_fold( - arg_types.first().unwrap().clone(), - |acc, x| { - // The coerced types found by `comparison_coercion` are not guaranteed to be - // coercible for the arguments. `comparison_coercion` returns more loose - // types that can be coerced to both `acc` and `x` for comparison purpose. - // See `maybe_data_types` for the actual coercion. 
- let coerced_type = comparison_coercion(&acc, x); - if let Some(coerced_type) = coerced_type { - Ok(coerced_type) - } else { - internal_err!("Coercion from {acc:?} to {x:?} failed.") - } - }, - )?; - Ok(vec![new_type; arg_types.len()]) + if let Some(new_type) = type_union_resolution(arg_types) { + if let DataType::FixedSizeList(field, _) = new_type { + Ok(vec![DataType::List(field); arg_types.len()]) + } else if new_type.is_null() { + Ok(vec![DataType::Int64; arg_types.len()]) + } else { + Ok(vec![new_type; arg_types.len()]) + } + } else { + plan_err!( + "Fail to find the valid type between {:?} for {}", + arg_types, + self.name() + ) + } } } diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 4dc34284c719..e5d280289342 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -456,7 +456,6 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { self.schema, &func, )?; - let new_expr = coerce_arguments_for_fun(new_expr, self.schema, &func)?; Ok(Transformed::yes(Expr::ScalarFunction( ScalarFunction::new_udf(func, new_expr), ))) @@ -756,30 +755,6 @@ fn coerce_arguments_for_signature_with_aggregate_udf( .collect() } -fn coerce_arguments_for_fun( - expressions: Vec, - schema: &DFSchema, - fun: &Arc, -) -> Result> { - // Cast Fixedsizelist to List for array functions - if fun.name() == "make_array" { - expressions - .into_iter() - .map(|expr| { - let data_type = expr.get_type(schema).unwrap(); - if let DataType::FixedSizeList(field, _) = data_type { - let to_type = DataType::List(Arc::clone(&field)); - expr.cast_to(&to_type, schema) - } else { - Ok(expr) - } - }) - .collect() - } else { - Ok(expressions) - } -} - fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { // Given expressions like: // diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index b7d60b50586d..bcd80cbe9b24 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6595,7 +6595,7 @@ select make_array(1, 2.0, null, 3) query ? select make_array(1.0, '2', null) ---- -[1.0, 2, ] +[1.0, 2.0, ] ### FixedSizeListArray diff --git a/datafusion/sqllogictest/test_files/errors.slt b/datafusion/sqllogictest/test_files/errors.slt index be7fdac71b57..7abf94932c71 100644 --- a/datafusion/sqllogictest/test_files/errors.slt +++ b/datafusion/sqllogictest/test_files/errors.slt @@ -128,5 +128,5 @@ from aggregate_test_100 order by c9 -statement error Inconsistent data type across values list at row 1 column 0. Was Int64 but found Utf8 +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'foo' to value of Int64 type create table foo as values (1), ('foo'); diff --git a/datafusion/sqllogictest/test_files/map.slt b/datafusion/sqllogictest/test_files/map.slt index 45e1b51a09b4..726de75b5141 100644 --- a/datafusion/sqllogictest/test_files/map.slt +++ b/datafusion/sqllogictest/test_files/map.slt @@ -148,18 +148,17 @@ SELECT MAKE_MAP([1,2], ['a', 'b'], [3,4], ['b']); {[1, 2]: [a, b], [3, 4]: [b]} query ? 
-SELECT MAKE_MAP('POST', 41, 'HEAD', 'ab', 'PATCH', 30); +SELECT MAKE_MAP('POST', 41, 'HEAD', 53, 'PATCH', 30); ---- -{POST: 41, HEAD: ab, PATCH: 30} +{POST: 41, HEAD: 53, PATCH: 30} + +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'ab' to value of Int64 type +SELECT MAKE_MAP('POST', 41, 'HEAD', 'ab', 'PATCH', 30); +# Map keys can not be NULL query error SELECT MAKE_MAP('POST', 41, 'HEAD', 33, null, 30); -query ? -SELECT MAKE_MAP('POST', 41, 'HEAD', 'ab', 'PATCH', 30); ----- -{POST: 41, HEAD: ab, PATCH: 30} - query ? SELECT MAKE_MAP() ---- @@ -517,9 +516,12 @@ query error SELECT MAP {'a': MAP {1:'a', 2:'b', 3:'c'}, 'b': MAP {2:'c', 4:'d'} }[NULL]; query ? -SELECT MAP { 'a': 1, 2: 3 }; +SELECT MAP { 'a': 1, 'b': 3 }; ---- -{a: 1, 2: 3} +{a: 1, b: 3} + +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type +SELECT MAP { 'a': 1, 2: 3 }; # TODO(https://github.com/apache/datafusion/issues/11785): fix accessing map with non-string key # query ? @@ -610,9 +612,12 @@ select map_extract(column1, 1), map_extract(column1, 5), map_extract(column1, 7) # Tests for map_keys query ? -SELECT map_keys(MAP { 'a': 1, 2: 3 }); +SELECT map_keys(MAP { 'a': 1, 'b': 3 }); ---- -[a, 2] +[a, b] + +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type +SELECT map_keys(MAP { 'a': 1, 2: 3 }); query ? SELECT map_keys(MAP {'a':1, 'b':2, 'c':3 }) FROM t; @@ -657,8 +662,11 @@ SELECT map_keys(column1) from map_array_table_1; # Tests for map_values -query ? +query error DataFusion error: Arrow error: Cast error: Cannot cast string 'a' to value of Int64 type SELECT map_values(MAP { 'a': 1, 2: 3 }); + +query ? +SELECT map_values(MAP { 'a': 1, 'b': 3 }); ---- [1, 3] diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 5df5f313af3c..0fef56aeea5c 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -348,8 +348,11 @@ VALUES (1),() statement error DataFusion error: Error during planning: Inconsistent data length across values list: got 2 values in row 1 but expected 1 VALUES (1),(1,2) -statement error DataFusion error: Error during planning: Inconsistent data type across values list at row 1 column 0 +query I VALUES (1),('2') +---- +1 +2 query R VALUES (1),(2.0) @@ -357,8 +360,11 @@ VALUES (1),(2.0) 1 2 -statement error DataFusion error: Error during planning: Inconsistent data type across values list at row 1 column 1 +query II VALUES (1,2), (1,'2') +---- +1 2 +1 2 query IT VALUES (1,'a'),(NULL,'b'),(3,'c') From 030c4e942332d716c9eabfb9297b4c75fe572377 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 5 Oct 2024 06:12:40 -0400 Subject: [PATCH 08/22] Add `DocumentationBuilder::with_standard_argument` to reduce copy/paste (#12747) * Add `DocumentationBuilder::with_standard_expression` to reduce copy/paste * fix doc * fix standard argument * Update docs * Improve documentation to explain what is different --- datafusion/expr/src/udf_docs.rs | 24 +++++++++++++++++++ .../functions-aggregate/src/bit_and_or_xor.rs | 19 +++++---------- datafusion/functions/src/crypto/sha224.rs | 3 +-- datafusion/functions/src/datetime/to_date.rs | 5 +--- datafusion/functions/src/math/log.rs | 6 ++--- datafusion/functions/src/regex/regexplike.rs | 6 ++--- datafusion/functions/src/unicode/rpad.rs | 4 ++-- .../user-guide/sql/aggregate_functions_new.md | 6 ++--- .../user-guide/sql/scalar_functions_new.md | 14 
+++++------ 9 files changed, 48 insertions(+), 39 deletions(-) diff --git a/datafusion/expr/src/udf_docs.rs b/datafusion/expr/src/udf_docs.rs index 280910b87199..e8245588d945 100644 --- a/datafusion/expr/src/udf_docs.rs +++ b/datafusion/expr/src/udf_docs.rs @@ -131,6 +131,9 @@ impl DocumentationBuilder { self } + /// Adds documentation for a specific argument to the documentation. + /// + /// Arguments are displayed in the order they are added. pub fn with_argument( mut self, arg_name: impl Into, @@ -142,6 +145,27 @@ impl DocumentationBuilder { self } + /// Add a standard "expression" argument to the documentation + /// + /// This is similar to [`Self::with_argument`] except that a standard + /// description is appended to the end: `"Can be a constant, column, or + /// function, and any combination of arithmetic operators."` + /// + /// The argument is rendered like + /// + /// ```text + /// : + /// expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. + /// ``` + pub fn with_standard_argument( + self, + arg_name: impl Into, + expression_type: impl AsRef, + ) -> Self { + let expression_type = expression_type.as_ref(); + self.with_argument(arg_name, format!("{expression_type} expression to operate on. Can be a constant, column, or function, and any combination of operators.")) + } + pub fn with_related_udf(mut self, related_udf: impl Into) -> Self { let mut related = self.related_udfs.unwrap_or_default(); related.push(related_udf.into()); diff --git a/datafusion/functions-aggregate/src/bit_and_or_xor.rs b/datafusion/functions-aggregate/src/bit_and_or_xor.rs index ce36e09bc25b..c5382c168f17 100644 --- a/datafusion/functions-aggregate/src/bit_and_or_xor.rs +++ b/datafusion/functions-aggregate/src/bit_and_or_xor.rs @@ -142,10 +142,7 @@ fn get_bit_and_doc() -> &'static Documentation { .with_doc_section(DOC_SECTION_GENERAL) .with_description("Computes the bitwise AND of all non-null input values.") .with_syntax_example("bit_and(expression)") - .with_argument( - "expression", - "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", - ) + .with_standard_argument("expression", "Integer") .build() .unwrap() }) @@ -159,10 +156,7 @@ fn get_bit_or_doc() -> &'static Documentation { .with_doc_section(DOC_SECTION_GENERAL) .with_description("Computes the bitwise OR of all non-null input values.") .with_syntax_example("bit_or(expression)") - .with_argument( - "expression", - "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", - ) + .with_standard_argument("expression", "Integer") .build() .unwrap() }) @@ -174,12 +168,11 @@ fn get_bit_xor_doc() -> &'static Documentation { BIT_XOR_DOC.get_or_init(|| { Documentation::builder() .with_doc_section(DOC_SECTION_GENERAL) - .with_description("Computes the bitwise exclusive OR of all non-null input values.") - .with_syntax_example("bit_xor(expression)") - .with_argument( - "expression", - "Expression to operate on. 
Can be a constant, column, or function, and any combination of arithmetic operators.", + .with_description( + "Computes the bitwise exclusive OR of all non-null input values.", ) + .with_syntax_example("bit_xor(expression)") + .with_standard_argument("expression", "Integer") .build() .unwrap() }) diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index df3045f22cf5..d603e5bcf295 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -58,8 +58,7 @@ fn get_sha224_doc() -> &'static Documentation { .with_doc_section(DOC_SECTION_HASHING) .with_description("Computes the SHA-224 hash of a binary string.") .with_syntax_example("sha224(expression)") - .with_argument("expression", - "String expression to operate on. Can be a constant, column, or function, and any combination of string operators.") + .with_standard_argument("expression", "String") .build() .unwrap() }) diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 176d7f8bbcbf..2803fd042b99 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -111,10 +111,7 @@ Note: `to_date` returns Date32, which represents its values as the number of day Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/to_date.rs) "#) - .with_argument( - "expression", - "Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.", - ) + .with_standard_argument("expression", "String") .with_argument( "format_n", "Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index 889e3761d26c..07ff8e2166ff 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -57,10 +57,8 @@ fn get_log_doc() -> &'static Documentation { .with_description("Returns the base-x logarithm of a number. Can either provide a specified base, or if omitted then takes the base-10 of a number.") .with_syntax_example(r#"log(base, numeric_expression) log(numeric_expression)"#) - .with_argument("base", - "Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") - .with_argument("numeric_expression", - "Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators.") + .with_standard_argument("base", "Base numeric") + .with_standard_argument("numeric_expression", "Numeric") .build() .unwrap() }) diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 2abb4a9376c5..f647b1969168 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -67,10 +67,8 @@ SELECT regexp_like('aBc', '(b|d)', 'i'); ``` Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs) "#) - .with_argument("str", - "String expression to operate on. Can be a constant, column, or function, and any combination of string operators.") - .with_argument("regexp", - "Regular expression to test against the string expression. 
Can be a constant, column, or function.") + .with_standard_argument("str", "String") + .with_standard_argument("regexp","Regular") .with_argument("flags", r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: - **i**: case-insensitive: letters match both upper and lower case diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index ce221b44f42b..05ecff05a179 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -55,9 +55,9 @@ fn get_rpad_doc() -> &'static Documentation { .with_doc_section(DOC_SECTION_STRING) .with_description("Pads the right side of a string with another string to a specified string length.") .with_syntax_example("rpad(str, n[, padding_str])") - .with_argument( + .with_standard_argument( "str", - "String expression to operate on. Can be a constant, column, or function, and any combination of string operators.", + "String", ) .with_argument("n", "String length to pad to.") .with_argument("padding_str", diff --git a/docs/source/user-guide/sql/aggregate_functions_new.md b/docs/source/user-guide/sql/aggregate_functions_new.md index 8303c50c2471..2c8ebc3be990 100644 --- a/docs/source/user-guide/sql/aggregate_functions_new.md +++ b/docs/source/user-guide/sql/aggregate_functions_new.md @@ -47,7 +47,7 @@ bit_and(expression) #### Arguments -- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. ### `bit_or` @@ -59,7 +59,7 @@ bit_or(expression) #### Arguments -- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. ### `bit_xor` @@ -71,4 +71,4 @@ bit_xor(expression) #### Arguments -- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md index ae2744c1650e..bff2c0f485c3 100644 --- a/docs/source/user-guide/sql/scalar_functions_new.md +++ b/docs/source/user-guide/sql/scalar_functions_new.md @@ -44,8 +44,8 @@ log(numeric_expression) #### Arguments -- **base**: Base numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. -- **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **base**: Base numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **numeric_expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. ## Conditional Functions @@ -94,7 +94,7 @@ rpad(str, n[, padding_str]) #### Arguments -- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. 
- **n**: String length to pad to. - **padding_str**: String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._ @@ -160,8 +160,8 @@ regexp_like(str, regexp[, flags]) #### Arguments -- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. -- **regexp**: Regular expression to test against the string expression. Can be a constant, column, or function. +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **regexp**: Regular expression to operate on. Can be a constant, column, or function, and any combination of operators. - **flags**: Optional regular expression flags that control the behavior of the regular expression. The following flags are supported: - **i**: case-insensitive: letters match both upper and lower case - **m**: multi-line mode: ^ and $ match begin/end of line @@ -208,7 +208,7 @@ to_date('2017-05-31', '%Y-%m-%d') #### Arguments -- **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. +- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. - **format_n**: Optional [Chrono format](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) strings to use to parse the expression. Formats will be tried in the order they appear with the first successful one being returned. If none of the formats successfully parse the expression an error will be returned. @@ -246,4 +246,4 @@ sha224(expression) #### Arguments -- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of string operators. +- **expression**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. From 862bb4ae085b08ac8a29e8f8314932631ce27c08 Mon Sep 17 00:00:00 2001 From: kamille Date: Sat, 5 Oct 2024 18:44:34 +0800 Subject: [PATCH 09/22] fix `equal_to` in `PrimitiveGroupValueBuilder` (#12758) * fix `equal_to` in `PrimitiveGroupValueBuilder`. * fix typo. * add uts. * reduce calling of `is_null`. 
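In other words, for a nullable column the fixed `equal_to` decides equality from nullness first: if exactly one of the existing and input rows is null they are not equal, and if both are null they are equal without ever reading the stored value slots, which hold arbitrary data for null rows (the old code still compared them, so two null group keys could spuriously compare unequal). A standalone sketch of the rule, using a hypothetical helper for illustration (the patch itself inlines these checks into `equal_to`):

```rust
/// Null-aware equality: two nulls compare equal (their backing slots are
/// meaningless), mismatched nullness compares unequal, and only two valid
/// values fall through to a real value comparison.
fn null_aware_eq(lhs: Option<i64>, rhs: Option<i64>) -> bool {
    match (lhs, rhs) {
        (None, None) => true,
        (None, Some(_)) | (Some(_), None) => false,
        (Some(l), Some(r)) => l == r,
    }
}

fn main() {
    assert!(null_aware_eq(None, None)); // both null: equal, values never read
    assert!(!null_aware_eq(None, Some(2))); // nullness differs: not equal
    assert!(null_aware_eq(Some(3), Some(3))); // both valid: values decide
}
```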
--- .../aggregates/group_values/group_column.rs | 107 ++++++++++++++++-- 1 file changed, 98 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index 15c93262968e..aa246ac95b8b 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -91,15 +91,28 @@ impl GroupColumn for PrimitiveGroupValueBuilder { fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { - // Perf: skip null check (by short circuit) if input is not ullable - let null_match = if NULLABLE { - self.nulls.is_null(lhs_row) == array.is_null(rhs_row) - } else { - true - }; + // Perf: skip null check (by short circuit) if input is not nullable + if NULLABLE { + // In nullable path, we should check if both `exist row` and `input row` + // are null/not null + let is_exist_null = self.nulls.is_null(lhs_row); + let null_match = is_exist_null == array.is_null(rhs_row); + if !null_match { + // If `is_null`s in `exist row` and `input row` don't match, return not equal to + return false; + } else if is_exist_null { + // If `is_null`s in `exist row` and `input row` match, and they are `null`s, + // return equal to + // + // NOTICE: we should not check their values when they are `null`s, because they are + // meaningless actually, and not ensured to be same + // + return true; + } + // Otherwise, we need to check their values + } - null_match - && self.group_values[lhs_row] == array.as_primitive::().value(rhs_row) + self.group_values[lhs_row] == array.as_primitive::().value(rhs_row) } fn append_val(&mut self, array: &ArrayRef, row: usize) { @@ -373,9 +386,13 @@ where mod tests { use std::sync::Arc; - use arrow_array::{ArrayRef, StringArray}; + use arrow::datatypes::Int64Type; + use arrow_array::{ArrayRef, Int64Array, StringArray}; + use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; use datafusion_physical_expr::binary_map::OutputType; + use crate::aggregates::group_values::group_column::PrimitiveGroupValueBuilder; + use super::{ByteGroupValueBuilder, GroupColumn}; #[test] @@ -422,4 +439,76 @@ mod tests { ])) as ArrayRef; assert_eq!(&output, &array); } + + #[test] + fn test_nullable_primitive_equal_to() { + // Will cover such cases: + // - exist null, input not null + // - exist null, input null; values not equal + // - exist null, input null; values equal + // - exist not null, input null + // - exist not null, input not null; values not equal + // - exist not null, input not null; values equal + + // Define PrimitiveGroupValueBuilder + let mut builder = PrimitiveGroupValueBuilder::::new(); + let builder_array = Arc::new(Int64Array::from(vec![ + None, + None, + None, + Some(1), + Some(2), + Some(3), + ])) as ArrayRef; + builder.append_val(&builder_array, 0); + builder.append_val(&builder_array, 1); + builder.append_val(&builder_array, 2); + builder.append_val(&builder_array, 3); + builder.append_val(&builder_array, 4); + builder.append_val(&builder_array, 5); + + // Define input array + let (_, values, _) = + Int64Array::from(vec![Some(1), Some(2), None, None, Some(1), Some(3)]) + .into_parts(); + + let mut boolean_buffer_builder = BooleanBufferBuilder::new(6); + boolean_buffer_builder.append(true); + boolean_buffer_builder.append(false); + boolean_buffer_builder.append(false); + boolean_buffer_builder.append(false); + boolean_buffer_builder.append(true); + 
boolean_buffer_builder.append(true); + let nulls = NullBuffer::new(boolean_buffer_builder.finish()); + let input_array = Arc::new(Int64Array::new(values, Some(nulls))) as ArrayRef; + + // Check + assert!(!builder.equal_to(0, &input_array, 0)); + assert!(builder.equal_to(1, &input_array, 1)); + assert!(builder.equal_to(2, &input_array, 2)); + assert!(!builder.equal_to(3, &input_array, 3)); + assert!(!builder.equal_to(4, &input_array, 4)); + assert!(builder.equal_to(5, &input_array, 5)); + } + + #[test] + fn test_not_nullable_primitive_equal_to() { + // Will cover such cases: + // - values equal + // - values not equal + + // Define PrimitiveGroupValueBuilder + let mut builder = PrimitiveGroupValueBuilder::::new(); + let builder_array = + Arc::new(Int64Array::from(vec![Some(0), Some(1)])) as ArrayRef; + builder.append_val(&builder_array, 0); + builder.append_val(&builder_array, 1); + + // Define input array + let input_array = Arc::new(Int64Array::from(vec![Some(0), Some(2)])) as ArrayRef; + + // Check + assert!(builder.equal_to(0, &input_array, 0)); + assert!(!builder.equal_to(1, &input_array, 1)); + } } From 6f8c74ca873fe015b2e7ff4eeb2f76bf21ae9d0e Mon Sep 17 00:00:00 2001 From: jcsherin Date: Sat, 5 Oct 2024 16:43:35 +0530 Subject: [PATCH 10/22] Minor: doc how field name is to be set (#12757) --- datafusion/expr/src/udwf.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/datafusion/expr/src/udwf.rs b/datafusion/expr/src/udwf.rs index 6459e8f3f7d1..69f357d48f8c 100644 --- a/datafusion/expr/src/udwf.rs +++ b/datafusion/expr/src/udwf.rs @@ -355,6 +355,10 @@ pub trait WindowUDFImpl: Debug + Send + Sync { } /// The [`Field`] of the final result of evaluating this window function. + /// + /// Call `field_args.name()` to get the fully qualified name for defining + /// the [`Field`]. For a complete example see the implementation in the + /// [Basic Example](WindowUDFImpl#basic-example) section. fn field(&self, field_args: WindowUDFFieldArgs) -> Result; /// Allows the window UDF to define a custom result ordering. 
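To make the documented convention concrete, a hypothetical `field` implementation might look like the sketch below; the import paths and the non-null `UInt64` result type are assumptions for illustration, not code from this patch:

```rust
// Assumed imports; exact paths may differ between DataFusion versions.
use arrow_schema::{DataType, Field};
use datafusion_common::Result;
use datafusion_functions_window_common::field::WindowUDFFieldArgs;

// Body of a hypothetical `WindowUDFImpl::field` (shown here as a free
// function, without the trait's `&self` receiver, for brevity).
fn field(field_args: WindowUDFFieldArgs) -> Result<Field> {
    // `field_args.name()` supplies the fully qualified name, so the output
    // schema stays consistent with how the planner refers to the expression.
    Ok(Field::new(field_args.name(), DataType::UInt64, false))
}
```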
From 18f920135c4ff9cfd081a335d20ee675b2916503 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 6 Oct 2024 06:40:21 -0400 Subject: [PATCH 11/22] Fix `equal_to` in `ByteGroupValueBuilder` (#12770) * Fix `equal_to` in `ByteGroupValueBuilder` * refactor null_equal_to * Update datafusion/physical-plan/src/aggregates/group_values/group_column.rs --- .../aggregates/group_values/group_column.rs | 108 ++++++++++++++---- 1 file changed, 88 insertions(+), 20 deletions(-) diff --git a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs index aa246ac95b8b..5d00f300e960 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/group_column.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/group_column.rs @@ -93,21 +93,10 @@ impl GroupColumn fn equal_to(&self, lhs_row: usize, array: &ArrayRef, rhs_row: usize) -> bool { // Perf: skip null check (by short circuit) if input is not nullable if NULLABLE { - // In nullable path, we should check if both `exist row` and `input row` - // are null/not null - let is_exist_null = self.nulls.is_null(lhs_row); - let null_match = is_exist_null == array.is_null(rhs_row); - if !null_match { - // If `is_null`s in `exist row` and `input row` don't match, return not equal to - return false; - } else if is_exist_null { - // If `is_null`s in `exist row` and `input row` match, and they are `null`s, - // return equal to - // - // NOTICE: we should not check their values when they are `null`s, because they are - // meaningless actually, and not ensured to be same - // - return true; + let exist_null = self.nulls.is_null(lhs_row); + let input_null = array.is_null(rhs_row); + if let Some(result) = nulls_equal_to(exist_null, input_null) { + return result; } // Otherwise, we need to check their values } @@ -224,9 +213,14 @@ where where B: ByteArrayType, { - let arr = array.as_bytes::(); - self.nulls.is_null(lhs_row) == arr.is_null(rhs_row) - && self.value(lhs_row) == (arr.value(rhs_row).as_ref() as &[u8]) + let array = array.as_bytes::(); + let exist_null = self.nulls.is_null(lhs_row); + let input_null = array.is_null(rhs_row); + if let Some(result) = nulls_equal_to(exist_null, input_null) { + return result; + } + // Otherwise, we need to check their values + self.value(lhs_row) == (array.value(rhs_row).as_ref() as &[u8]) } /// return the current value of the specified row irrespective of null @@ -382,6 +376,20 @@ where } } +/// Determines if the nullability of the existing and new input array can be used +/// to short-circuit the comparison of the two values. +/// +/// Returns `Some(result)` if the result of the comparison can be determined +/// from the nullness of the two values, and `None` if the comparison must be +/// done on the values themselves. 
+fn nulls_equal_to(lhs_null: bool, rhs_null: bool) -> Option { + match (lhs_null, rhs_null) { + (true, true) => Some(true), + (false, true) | (true, false) => Some(false), + _ => None, + } +} + #[cfg(test)] mod tests { use std::sync::Arc; @@ -468,13 +476,14 @@ mod tests { builder.append_val(&builder_array, 5); // Define input array - let (_, values, _) = + let (_nulls, values, _) = Int64Array::from(vec![Some(1), Some(2), None, None, Some(1), Some(3)]) .into_parts(); + // explicitly build a boolean buffer where one of the null values also happens to match let mut boolean_buffer_builder = BooleanBufferBuilder::new(6); boolean_buffer_builder.append(true); - boolean_buffer_builder.append(false); + boolean_buffer_builder.append(false); // this sets Some(2) to null above boolean_buffer_builder.append(false); boolean_buffer_builder.append(false); boolean_buffer_builder.append(true); @@ -511,4 +520,63 @@ mod tests { assert!(builder.equal_to(0, &input_array, 0)); assert!(!builder.equal_to(1, &input_array, 1)); } + + #[test] + fn test_byte_array_equal_to() { + // Will cover such cases: + // - exist null, input not null + // - exist null, input null; values not equal + // - exist null, input null; values equal + // - exist not null, input null + // - exist not null, input not null; values not equal + // - exist not null, input not null; values equal + + // Define PrimitiveGroupValueBuilder + let mut builder = ByteGroupValueBuilder::::new(OutputType::Utf8); + let builder_array = Arc::new(StringArray::from(vec![ + None, + None, + None, + Some("foo"), + Some("bar"), + Some("baz"), + ])) as ArrayRef; + builder.append_val(&builder_array, 0); + builder.append_val(&builder_array, 1); + builder.append_val(&builder_array, 2); + builder.append_val(&builder_array, 3); + builder.append_val(&builder_array, 4); + builder.append_val(&builder_array, 5); + + // Define input array + let (offsets, buffer, _nulls) = StringArray::from(vec![ + Some("foo"), + Some("bar"), + None, + None, + Some("foo"), + Some("baz"), + ]) + .into_parts(); + + // explicitly build a boolean buffer where one of the null values also happens to match + let mut boolean_buffer_builder = BooleanBufferBuilder::new(6); + boolean_buffer_builder.append(true); + boolean_buffer_builder.append(false); // this sets Some("bar") to null above + boolean_buffer_builder.append(false); + boolean_buffer_builder.append(false); + boolean_buffer_builder.append(true); + boolean_buffer_builder.append(true); + let nulls = NullBuffer::new(boolean_buffer_builder.finish()); + let input_array = + Arc::new(StringArray::new(offsets, buffer, Some(nulls))) as ArrayRef; + + // Check + assert!(!builder.equal_to(0, &input_array, 0)); + assert!(builder.equal_to(1, &input_array, 1)); + assert!(builder.equal_to(2, &input_array, 2)); + assert!(!builder.equal_to(3, &input_array, 3)); + assert!(!builder.equal_to(4, &input_array, 4)); + assert!(builder.equal_to(5, &input_array, 5)); + } } From 9bf0630a1870c6eb8fb381c1ba1e793f3b652b2f Mon Sep 17 00:00:00 2001 From: Emil Ejbyfeldt Date: Sun, 6 Oct 2024 12:42:32 +0200 Subject: [PATCH 12/22] Allow simplification even when nullable (#12746) The nullable requirement seem to have been added in #1401 but as far as I can tell they are not needed for these 2 cases. 
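The key observation is that absorption still holds under SQL's three-valued logic: if `A` is NULL, then `A AND B` can only be NULL or false, so `A OR (A AND B)` evaluates to NULL, which is exactly `A`; dually, `A OR B` can only be NULL or true, so `A AND (A OR B)` is NULL as well.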
I think this can be shown using this truth table: (generated using datafusion-cli without this patch) ``` > CREATE TABLE t (v BOOLEAN) as values (true), (false), (NULL); > select t.v, t2.v, t.v AND (t.v OR t2.v), t.v OR (t.v AND t2.v) from t cross join t as t2; +-------+-------+---------------------+---------------------+ | v | v | t.v AND t.v OR t2.v | t.v OR t.v AND t2.v | +-------+-------+---------------------+---------------------+ | true | true | true | true | | true | false | true | true | | true | | true | true | | false | true | false | false | | false | false | false | false | | false | | false | false | | | true | | | | | false | | | | | | | | +-------+-------+---------------------+---------------------+ ``` And it seems Spark applies both of these and DuckDB applies only the first one. --- .../simplify_expressions/expr_simplifier.rs | 40 ++++++------------- datafusion/sqllogictest/test_files/cse.slt | 16 ++++---- 2 files changed, 21 insertions(+), 35 deletions(-) diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index a78a54a57123..e8164ce9895b 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -838,22 +838,18 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { op: Or, right, }) if expr_contains(&right, &left, Or) => Transformed::yes(*right), - // A OR (A AND B) --> A (if B not null) + // A OR (A AND B) --> A Expr::BinaryExpr(BinaryExpr { left, op: Or, right, - }) if !info.nullable(&right)? && is_op_with(And, &right, &left) => { - Transformed::yes(*left) - } - // (A AND B) OR A --> A (if B not null) + }) if is_op_with(And, &right, &left) => Transformed::yes(*left), + // (A AND B) OR A --> A Expr::BinaryExpr(BinaryExpr { left, op: Or, right, - }) if !info.nullable(&left)? && is_op_with(And, &left, &right) => { - Transformed::yes(*right) - } + }) if is_op_with(And, &left, &right) => Transformed::yes(*right), // // Rules for AND @@ -911,22 +907,18 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { op: And, right, }) if expr_contains(&right, &left, And) => Transformed::yes(*right), - // A AND (A OR B) --> A (if B not null) + // A AND (A OR B) --> A Expr::BinaryExpr(BinaryExpr { left, op: And, right, - }) if !info.nullable(&right)? && is_op_with(Or, &right, &left) => { - Transformed::yes(*left) - } - // (A OR B) AND A --> A (if B not null) + }) if is_op_with(Or, &right, &left) => Transformed::yes(*left), + // (A OR B) AND A --> A Expr::BinaryExpr(BinaryExpr { left, op: And, right, - }) if !info.nullable(&left)? 
&& is_op_with(Or, &left, &right) => { - Transformed::yes(*right) - } + }) if is_op_with(Or, &left, &right) => Transformed::yes(*right), // // Rules for Multiply @@ -2609,15 +2601,11 @@ mod tests { // (c2 > 5) OR ((c1 < 6) AND (c2 > 5)) let expr = or(l.clone(), r.clone()); - // no rewrites if c1 can be null - let expected = expr.clone(); + let expected = l.clone(); assert_eq!(simplify(expr), expected); // ((c1 < 6) AND (c2 > 5)) OR (c2 > 5) - let expr = or(l, r); - - // no rewrites if c1 can be null - let expected = expr.clone(); + let expr = or(r, l); assert_eq!(simplify(expr), expected); } @@ -2648,13 +2636,11 @@ mod tests { // (c2 > 5) AND ((c1 < 6) OR (c2 > 5)) --> c2 > 5 let expr = and(l.clone(), r.clone()); - // no rewrites if c1 can be null - let expected = expr.clone(); + let expected = l.clone(); assert_eq!(simplify(expr), expected); // ((c1 < 6) OR (c2 > 5)) AND (c2 > 5) --> c2 > 5 - let expr = and(l, r); - let expected = expr.clone(); + let expr = and(r, l); assert_eq!(simplify(expr), expected); } @@ -3223,7 +3209,7 @@ mod tests { )], Some(Box::new(col("c2").eq(lit(true)))), )))), - col("c2").or(col("c2").not().and(col("c2"))) // #1716 + col("c2") ); // CASE WHEN ISNULL(c2) THEN true ELSE c2 diff --git a/datafusion/sqllogictest/test_files/cse.slt b/datafusion/sqllogictest/test_files/cse.slt index 19b47fa50e41..9f0f654179e9 100644 --- a/datafusion/sqllogictest/test_files/cse.slt +++ b/datafusion/sqllogictest/test_files/cse.slt @@ -179,8 +179,8 @@ physical_plan # Surely only once but also conditionally evaluated expressions query TT EXPLAIN SELECT - (a = 1 OR random() = 0) AND a = 1 AS c1, - (a = 2 AND random() = 0) OR a = 2 AS c2, + (a = 1 OR random() = 0) AND a = 2 AS c1, + (a = 2 AND random() = 0) OR a = 1 AS c2, CASE WHEN a + 3 = 0 THEN a + 3 ELSE 0 END AS c3, CASE WHEN a + 4 = 0 THEN 0 WHEN a + 4 THEN 0 ELSE 0 END AS c4, CASE WHEN a + 5 = 0 THEN 0 WHEN random() = 0 THEN a + 5 ELSE 0 END AS c5, @@ -188,11 +188,11 @@ EXPLAIN SELECT FROM t1 ---- logical_plan -01)Projection: (__common_expr_1 OR random() = Float64(0)) AND __common_expr_1 AS c1, __common_expr_2 AND random() = Float64(0) OR __common_expr_2 AS c2, CASE WHEN __common_expr_3 = Float64(0) THEN __common_expr_3 ELSE Float64(0) END AS c3, CASE WHEN __common_expr_4 = Float64(0) THEN Int64(0) WHEN CAST(__common_expr_4 AS Boolean) THEN Int64(0) ELSE Int64(0) END AS c4, CASE WHEN __common_expr_5 = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN __common_expr_5 ELSE Float64(0) END AS c5, CASE WHEN __common_expr_6 = Float64(0) THEN Float64(0) ELSE __common_expr_6 END AS c6 +01)Projection: (__common_expr_1 OR random() = Float64(0)) AND __common_expr_2 AS c1, __common_expr_2 AND random() = Float64(0) OR __common_expr_1 AS c2, CASE WHEN __common_expr_3 = Float64(0) THEN __common_expr_3 ELSE Float64(0) END AS c3, CASE WHEN __common_expr_4 = Float64(0) THEN Int64(0) WHEN CAST(__common_expr_4 AS Boolean) THEN Int64(0) ELSE Int64(0) END AS c4, CASE WHEN __common_expr_5 = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN __common_expr_5 ELSE Float64(0) END AS c5, CASE WHEN __common_expr_6 = Float64(0) THEN Float64(0) ELSE __common_expr_6 END AS c6 02)--Projection: t1.a = Float64(1) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a + Float64(3) AS __common_expr_3, t1.a + Float64(4) AS __common_expr_4, t1.a + Float64(5) AS __common_expr_5, t1.a + Float64(6) AS __common_expr_6 03)----TableScan: t1 projection=[a] physical_plan -01)ProjectionExec: expr=[(__common_expr_1@0 OR random() = 0) AND __common_expr_1@0 
as c1, __common_expr_2@1 AND random() = 0 OR __common_expr_2@1 as c2, CASE WHEN __common_expr_3@2 = 0 THEN __common_expr_3@2 ELSE 0 END as c3, CASE WHEN __common_expr_4@3 = 0 THEN 0 WHEN CAST(__common_expr_4@3 AS Boolean) THEN 0 ELSE 0 END as c4, CASE WHEN __common_expr_5@4 = 0 THEN 0 WHEN random() = 0 THEN __common_expr_5@4 ELSE 0 END as c5, CASE WHEN __common_expr_6@5 = 0 THEN 0 ELSE __common_expr_6@5 END as c6] +01)ProjectionExec: expr=[(__common_expr_1@0 OR random() = 0) AND __common_expr_2@1 as c1, __common_expr_2@1 AND random() = 0 OR __common_expr_1@0 as c2, CASE WHEN __common_expr_3@2 = 0 THEN __common_expr_3@2 ELSE 0 END as c3, CASE WHEN __common_expr_4@3 = 0 THEN 0 WHEN CAST(__common_expr_4@3 AS Boolean) THEN 0 ELSE 0 END as c4, CASE WHEN __common_expr_5@4 = 0 THEN 0 WHEN random() = 0 THEN __common_expr_5@4 ELSE 0 END as c5, CASE WHEN __common_expr_6@5 = 0 THEN 0 ELSE __common_expr_6@5 END as c6] 02)--ProjectionExec: expr=[a@0 = 1 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 + 3 as __common_expr_3, a@0 + 4 as __common_expr_4, a@0 + 5 as __common_expr_5, a@0 + 6 as __common_expr_6] 03)----MemoryExec: partitions=1, partition_sizes=[0] @@ -217,8 +217,8 @@ physical_plan # Only conditionally evaluated expressions query TT EXPLAIN SELECT - (random() = 0 OR a = 1) AND a = 1 AS c1, - (random() = 0 AND a = 2) OR a = 2 AS c2, + (random() = 0 OR a = 1) AND a = 2 AS c1, + (random() = 0 AND a = 2) OR a = 1 AS c2, CASE WHEN random() = 0 THEN a + 3 ELSE a + 3 END AS c3, CASE WHEN random() = 0 THEN 0 WHEN a + 4 = 0 THEN a + 4 ELSE 0 END AS c4, CASE WHEN random() = 0 THEN 0 WHEN a + 5 = 0 THEN 0 ELSE a + 5 END AS c5, @@ -226,8 +226,8 @@ EXPLAIN SELECT FROM t1 ---- logical_plan -01)Projection: (random() = Float64(0) OR t1.a = Float64(1)) AND t1.a = Float64(1) AS c1, random() = Float64(0) AND t1.a = Float64(2) OR t1.a = Float64(2) AS c2, CASE WHEN random() = Float64(0) THEN t1.a + Float64(3) ELSE t1.a + Float64(3) END AS c3, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(4) = Float64(0) THEN t1.a + Float64(4) ELSE Float64(0) END AS c4, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(5) = Float64(0) THEN Float64(0) ELSE t1.a + Float64(5) END AS c5, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN t1.a + Float64(6) ELSE t1.a + Float64(6) END AS c6 +01)Projection: (random() = Float64(0) OR t1.a = Float64(1)) AND t1.a = Float64(2) AS c1, random() = Float64(0) AND t1.a = Float64(2) OR t1.a = Float64(1) AS c2, CASE WHEN random() = Float64(0) THEN t1.a + Float64(3) ELSE t1.a + Float64(3) END AS c3, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(4) = Float64(0) THEN t1.a + Float64(4) ELSE Float64(0) END AS c4, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(5) = Float64(0) THEN Float64(0) ELSE t1.a + Float64(5) END AS c5, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN t1.a + Float64(6) ELSE t1.a + Float64(6) END AS c6 02)--TableScan: t1 projection=[a] physical_plan -01)ProjectionExec: expr=[(random() = 0 OR a@0 = 1) AND a@0 = 1 as c1, random() = 0 AND a@0 = 2 OR a@0 = 2 as c2, CASE WHEN random() = 0 THEN a@0 + 3 ELSE a@0 + 3 END as c3, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 4 = 0 THEN a@0 + 4 ELSE 0 END as c4, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 5 = 0 THEN 0 ELSE a@0 + 5 END as c5, CASE WHEN random() = 0 THEN 0 WHEN random() = 0 THEN a@0 + 6 ELSE a@0 + 6 END as c6] +01)ProjectionExec: expr=[(random() = 0 OR a@0 = 1) AND a@0 = 2 as c1, random() = 0 
AND a@0 = 2 OR a@0 = 1 as c2, CASE WHEN random() = 0 THEN a@0 + 3 ELSE a@0 + 3 END as c3, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 4 = 0 THEN a@0 + 4 ELSE 0 END as c4, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 5 = 0 THEN 0 ELSE a@0 + 5 END as c5, CASE WHEN random() = 0 THEN 0 WHEN random() = 0 THEN a@0 + 6 ELSE a@0 + 6 END as c6] 02)--MemoryExec: partitions=1, partition_sizes=[0] From ecb0044f846d48ba2bbf4a164e36fa8fc8ff9997 Mon Sep 17 00:00:00 2001 From: Jax Liu Date: Sun, 6 Oct 2024 19:08:36 +0800 Subject: [PATCH 13/22] Fix unnest conjunction with selecting wildcard expression (#12760) * fix unnest statement with wildcard expression * add commnets --- datafusion/expr/src/utils.rs | 10 ++++ datafusion/sql/src/utils.rs | 4 +- datafusion/sqllogictest/test_files/unnest.slt | 54 ++++++++++++++++++- 3 files changed, 66 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 9bb53a1d04a0..fa927595040e 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -19,6 +19,7 @@ use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; +use std::ops::Deref; use std::sync::Arc; use crate::expr::{Alias, Sort, WildcardOptions, WindowFunction}; @@ -754,6 +755,15 @@ pub fn find_base_plan(input: &LogicalPlan) -> &LogicalPlan { match input { LogicalPlan::Window(window) => find_base_plan(&window.input), LogicalPlan::Aggregate(agg) => find_base_plan(&agg.input), + // [SqlToRel::try_process_unnest] will convert Expr(Unnest(Expr)) to Projection/Unnest/Projection + // We should expand the wildcard expression based on the input plan of the inner Projection. + LogicalPlan::Unnest(unnest) => { + if let LogicalPlan::Projection(projection) = unnest.input.deref() { + find_base_plan(&projection.input) + } else { + input + } + } LogicalPlan::Filter(filter) => { if filter.having { // If a filter is used for a having clause, its input plan is an aggregation. diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 656e4b851aa8..d8ad964be213 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -619,7 +619,9 @@ pub(crate) fn rewrite_recursive_unnest_bottom_up( } = original_expr.clone().rewrite(&mut rewriter)?; if !transformed { - if matches!(&transformed_expr, Expr::Column(_)) { + if matches!(&transformed_expr, Expr::Column(_)) + || matches!(&transformed_expr, Expr::Wildcard { .. }) + { push_projection_dedupl(inner_projection_exprs, transformed_expr.clone()); Ok(vec![transformed_expr]) } else { diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 63ca74e9714c..9e79805d52fb 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -33,7 +33,7 @@ AS VALUES statement ok CREATE TABLE nested_unnest_table AS VALUES - (struct('a', 'b', struct('c')), (struct('a', 'b', [10,20])), [struct('a', 'b')]), + (struct('a', 'b', struct('c')), (struct('a', 'b', [10,20])), [struct('a', 'b')]), (struct('d', 'e', struct('f')), (struct('x', 'y', [30,40, 50])), null) ; @@ -780,3 +780,55 @@ NULL 1 ### TODO: group by unnest struct query error DataFusion error: Error during planning: Projection references non\-aggregate values select unnest(column1) c1 from nested_unnest_table group by c1.c0; + +query II??I?? 
+select unnest(column5), * from unnest_table; +---- +1 2 [1, 2, 3] [7] 1 [13, 14] {c0: 1, c1: 2} +3 4 [4, 5] [8, 9, 10] 2 [15, 16] {c0: 3, c1: 4} +NULL NULL [6] [11, 12] 3 NULL NULL +7 8 [12] [, 42, ] NULL NULL {c0: 7, c1: 8} +NULL NULL NULL NULL 4 [17, 18] NULL + +query TT???? +select unnest(column1), * from nested_unnest_table +---- +a b {c0: c} {c0: a, c1: b, c2: {c0: c}} {c0: a, c1: b, c2: [10, 20]} [{c0: a, c1: b}] +d e {c0: f} {c0: d, c1: e, c2: {c0: f}} {c0: x, c1: y, c2: [30, 40, 50]} NULL + +query ????? +select unnest(unnest(column3)), * from recursive_unnest_table +---- +[1] [[1, 2]] {c0: [1], c1: a} [[[1], [2]], [[1, 1]]] [{c0: [1], c1: [[1, 2]]}] +[2] [[3], [4]] {c0: [2], c1: b} [[[3, 4], [5]], [[, 6], , [7, 8]]] [{c0: [2], c1: [[3], [4]]}] + +statement ok +CREATE TABLE join_table +AS VALUES + (1, 2, 3), + (2, 3, 4), + (4, 5, 6) +; + +query IIIII +select unnest(u.column5), j.* from unnest_table u join join_table j on u.column3 = j.column1 +---- +1 2 1 2 3 +3 4 2 3 4 +NULL NULL 4 5 6 + +query II?I? +select unnest(column5), * except (column5, column1) from unnest_table; +---- +1 2 [7] 1 [13, 14] +3 4 [8, 9, 10] 2 [15, 16] +NULL NULL [11, 12] 3 NULL +7 8 [, 42, ] NULL NULL +NULL NULL NULL 4 [17, 18] + +query III +select unnest(u.column5), j.* except(column2, column3) from unnest_table u join join_table j on u.column3 = j.column1 +---- +1 2 1 +3 4 2 +NULL NULL 4 From 9b492c6a5e168171f14d4e985fcb43b535c2e872 Mon Sep 17 00:00:00 2001 From: Sergei Grebnov Date: Sun, 6 Oct 2024 04:12:50 -0700 Subject: [PATCH 14/22] Improve `round` scalar function unparsing for Postgres (#12744) * Postgres: enforce required `NUMERIC` type for `round` scalar function (#34) Includes initial support for dialects to override scalar functions unparsing * Document scalar_function_to_sql_overrides fn --- datafusion/sql/src/unparser/dialect.rs | 119 ++++++++++++++- datafusion/sql/src/unparser/expr.rs | 198 +++++++++---------------- datafusion/sql/src/unparser/utils.rs | 82 +++++++++- 3 files changed, 273 insertions(+), 126 deletions(-) diff --git a/datafusion/sql/src/unparser/dialect.rs b/datafusion/sql/src/unparser/dialect.rs index d8a4fb254264..609e6f2240e1 100644 --- a/datafusion/sql/src/unparser/dialect.rs +++ b/datafusion/sql/src/unparser/dialect.rs @@ -18,12 +18,17 @@ use std::sync::Arc; use arrow_schema::TimeUnit; +use datafusion_expr::Expr; use regex::Regex; use sqlparser::{ - ast::{self, Ident, ObjectName, TimezoneInfo}, + ast::{self, Function, Ident, ObjectName, TimezoneInfo}, keywords::ALL_KEYWORDS, }; +use datafusion_common::Result; + +use super::{utils::date_part_to_sql, Unparser}; + /// `Dialect` to use for Unparsing /// /// The default dialect tries to avoid quoting identifiers unless necessary (e.g. `a` instead of `"a"`) @@ -108,6 +113,18 @@ pub trait Dialect: Send + Sync { fn supports_column_alias_in_table_alias(&self) -> bool { true } + + /// Allows the dialect to override scalar function unparsing if the dialect has specific rules. + /// Returns None if the default unparsing should be used, or Some(ast::Expr) if there is + /// a custom implementation for the function. 
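+    ///
+    /// Illustrative example (editorial addition, not part of the patch): a
+    /// Postgres dialect can intercept `round(x, 2)` here and return
+    /// `round(CAST(x AS NUMERIC), 2)`, while answering `Ok(None)` for every
+    /// other function name so that the default unparsing still applies.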
+ fn scalar_function_to_sql_overrides( + &self, + _unparser: &Unparser, + _func_name: &str, + _args: &[Expr], + ) -> Result<Option<ast::Expr>> { + Ok(None) + } } /// `IntervalStyle` to use for unparsing @@ -171,6 +188,67 @@ impl Dialect for PostgreSqlDialect { fn float64_ast_dtype(&self) -> sqlparser::ast::DataType { sqlparser::ast::DataType::DoublePrecision } + + fn scalar_function_to_sql_overrides( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result<Option<ast::Expr>> { + if func_name == "round" { + return Ok(Some( + self.round_to_sql_enforce_numeric(unparser, func_name, args)?, + )); + } + + Ok(None) + } +} + +impl PostgreSqlDialect { + fn round_to_sql_enforce_numeric( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result<ast::Expr> { + let mut args = unparser.function_args_to_sql(args)?; + + // Enforce the first argument to be Numeric + if let Some(ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(expr))) = + args.first_mut() + { + if let ast::Expr::Cast { data_type, .. } = expr { + // Don't create an additional cast wrapper if we can update the existing one + *data_type = ast::DataType::Numeric(ast::ExactNumberInfo::None); + } else { + // Wrap the expression in a new cast + *expr = ast::Expr::Cast { + kind: ast::CastKind::Cast, + expr: Box::new(expr.clone()), + data_type: ast::DataType::Numeric(ast::ExactNumberInfo::None), + format: None, + }; + } + } + + Ok(ast::Expr::Function(Function { + name: ast::ObjectName(vec![Ident { + value: func_name.to_string(), + quote_style: None, + }]), + args: ast::FunctionArguments::List(ast::FunctionArgumentList { + duplicate_treatment: None, + args, + clauses: vec![], + }), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + parameters: ast::FunctionArguments::None, + })) + } } pub struct MySqlDialect {} @@ -211,6 +289,19 @@ impl Dialect for MySqlDialect { ) -> ast::DataType { ast::DataType::Datetime(None) } + + fn scalar_function_to_sql_overrides( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result<Option<ast::Expr>> { + if func_name == "date_part" { + return date_part_to_sql(unparser, self.date_field_extract_style(), args); + } + + Ok(None) + } } pub struct SqliteDialect {} @@ -231,6 +322,19 @@ impl Dialect for SqliteDialect { fn supports_column_alias_in_table_alias(&self) -> bool { false } + + fn scalar_function_to_sql_overrides( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result<Option<ast::Expr>> { + if func_name == "date_part" { + return date_part_to_sql(unparser, self.date_field_extract_style(), args); + } + + Ok(None) + } } pub struct CustomDialect { @@ -339,6 +443,19 @@ impl Dialect for CustomDialect { fn supports_column_alias_in_table_alias(&self) -> bool { self.supports_column_alias_in_table_alias } + + fn scalar_function_to_sql_overrides( + &self, + unparser: &Unparser, + func_name: &str, + args: &[Expr], + ) -> Result<Option<ast::Expr>> { + if func_name == "date_part" { + return date_part_to_sql(unparser, self.date_field_extract_style(), args); + } + + Ok(None) + } } /// `CustomDialectBuilder` to build `CustomDialect` using builder pattern diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index b924268a7657..537ac2274424 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -15,16 +15,15 @@ // specific language governing permissions and limitations // under the License.
-use datafusion_expr::ScalarUDF; use sqlparser::ast::Value::SingleQuotedString; use sqlparser::ast::{ - self, BinaryOperator, Expr as AstExpr, Function, FunctionArg, Ident, Interval, - ObjectName, TimezoneInfo, UnaryOperator, + self, BinaryOperator, Expr as AstExpr, Function, Ident, Interval, ObjectName, + TimezoneInfo, UnaryOperator, }; use std::sync::Arc; use std::vec; -use super::dialect::{DateFieldExtractStyle, IntervalStyle}; +use super::dialect::IntervalStyle; use super::Unparser; use arrow::datatypes::{Decimal128Type, Decimal256Type, DecimalType}; use arrow::util::display::array_value_to_string; @@ -116,47 +115,14 @@ impl Unparser<'_> { Expr::ScalarFunction(ScalarFunction { func, args }) => { let func_name = func.name(); - if let Some(expr) = - self.scalar_function_to_sql_overrides(func_name, func, args) + if let Some(expr) = self + .dialect + .scalar_function_to_sql_overrides(self, func_name, args)? { return Ok(expr); } - let args = args - .iter() - .map(|e| { - if matches!( - e, - Expr::Wildcard { - qualifier: None, - .. - } - ) { - Ok(FunctionArg::Unnamed(ast::FunctionArgExpr::Wildcard)) - } else { - self.expr_to_sql_inner(e).map(|e| { - FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(e)) - }) - } - }) - .collect::<Result<Vec<_>>>()?; - - Ok(ast::Expr::Function(Function { - name: ast::ObjectName(vec![Ident { - value: func_name.to_string(), - quote_style: None, - }]), - args: ast::FunctionArguments::List(ast::FunctionArgumentList { - duplicate_treatment: None, - args, - clauses: vec![], - }), - filter: None, - null_treatment: None, - over: None, - within_group: vec![], - parameters: ast::FunctionArguments::None, - })) + self.scalar_function_to_sql(func_name, args) } Expr::Between(Between { expr, @@ -508,6 +474,30 @@ impl Unparser<'_> { } } + pub fn scalar_function_to_sql( + &self, + func_name: &str, + args: &[Expr], + ) -> Result<ast::Expr> { + let args = self.function_args_to_sql(args)?; + Ok(ast::Expr::Function(Function { + name: ast::ObjectName(vec![Ident { + value: func_name.to_string(), + quote_style: None, + }]), + args: ast::FunctionArguments::List(ast::FunctionArgumentList { + duplicate_treatment: None, + args, + clauses: vec![], + }), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + parameters: ast::FunctionArguments::None, + })) + } + pub fn sort_to_sql(&self, sort: &Sort) -> Result<ast::OrderByExpr> { let Sort { expr, @@ -530,87 +520,6 @@ impl Unparser<'_> { }) } - fn scalar_function_to_sql_overrides( - &self, - func_name: &str, - _func: &Arc<ScalarUDF>, - args: &[Expr], - ) -> Option<ast::Expr> { - if func_name.to_lowercase() == "date_part" { - match (self.dialect.date_field_extract_style(), args.len()) { - (DateFieldExtractStyle::Extract, 2) => { - let date_expr = self.expr_to_sql(&args[1]).ok()?; - - if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &args[0] { - let field = match field.to_lowercase().as_str() { - "year" => ast::DateTimeField::Year, - "month" => ast::DateTimeField::Month, - "day" => ast::DateTimeField::Day, - "hour" => ast::DateTimeField::Hour, - "minute" => ast::DateTimeField::Minute, - "second" => ast::DateTimeField::Second, - _ => return None, - }; - - return Some(ast::Expr::Extract { - field, - expr: Box::new(date_expr), - syntax: ast::ExtractSyntax::From, - }); - } - } - (DateFieldExtractStyle::Strftime, 2) => { - let column = self.expr_to_sql(&args[1]).ok()?; - - if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &args[0] { - let field = match field.to_lowercase().as_str() { - "year" => "%Y", - "month" => "%m", - "day" => "%d", - "hour" => "%H", - "minute" => "%M", -
"second" => "%S", - _ => return None, - }; - - return Some(ast::Expr::Function(ast::Function { - name: ast::ObjectName(vec![ast::Ident { - value: "strftime".to_string(), - quote_style: None, - }]), - args: ast::FunctionArguments::List( - ast::FunctionArgumentList { - duplicate_treatment: None, - args: vec![ - ast::FunctionArg::Unnamed( - ast::FunctionArgExpr::Expr(ast::Expr::Value( - ast::Value::SingleQuotedString( - field.to_string(), - ), - )), - ), - ast::FunctionArg::Unnamed( - ast::FunctionArgExpr::Expr(column), - ), - ], - clauses: vec![], - }, - ), - filter: None, - null_treatment: None, - over: None, - within_group: vec![], - parameters: ast::FunctionArguments::None, - })); - } - } - _ => {} // no overrides for DateFieldExtractStyle::DatePart, because it's already a date_part - } - } - - None - } - fn ast_type_for_date64_in_cast(&self) -> ast::DataType { if self.dialect.use_timestamp_for_date64() { ast::DataType::Timestamp(None, ast::TimezoneInfo::None) @@ -665,7 +574,10 @@ impl Unparser<'_> { } } - fn function_args_to_sql(&self, args: &[Expr]) -> Result> { + pub(crate) fn function_args_to_sql( + &self, + args: &[Expr], + ) -> Result> { args.iter() .map(|e| { if matches!( @@ -1554,7 +1466,10 @@ mod tests { use datafusion_functions_aggregate::expr_fn::sum; use datafusion_functions_window::row_number::row_number_udwf; - use crate::unparser::dialect::{CustomDialect, CustomDialectBuilder}; + use crate::unparser::dialect::{ + CustomDialect, CustomDialectBuilder, DateFieldExtractStyle, Dialect, + PostgreSqlDialect, + }; use super::*; @@ -2428,4 +2343,39 @@ mod tests { assert_eq!(actual, expected); } } + + #[test] + fn test_round_scalar_fn_to_expr() -> Result<()> { + let default_dialect: Arc = Arc::new( + CustomDialectBuilder::new() + .with_identifier_quote_style('"') + .build(), + ); + let postgres_dialect: Arc = Arc::new(PostgreSqlDialect {}); + + for (dialect, identifier) in + [(default_dialect, "DOUBLE"), (postgres_dialect, "NUMERIC")] + { + let unparser = Unparser::new(dialect.as_ref()); + let expr = Expr::ScalarFunction(ScalarFunction { + func: Arc::new(ScalarUDF::from( + datafusion_functions::math::round::RoundFunc::new(), + )), + args: vec![ + Expr::Cast(Cast { + expr: Box::new(col("a")), + data_type: DataType::Float64, + }), + Expr::Literal(ScalarValue::Int64(Some(2))), + ], + }); + let ast = unparser.expr_to_sql(&expr)?; + + let actual = format!("{}", ast); + let expected = format!(r#"round(CAST("a" AS {identifier}), 2)"#); + + assert_eq!(actual, expected); + } + Ok(()) + } } diff --git a/datafusion/sql/src/unparser/utils.rs b/datafusion/sql/src/unparser/utils.rs index 0059aba25738..8b2530a7499b 100644 --- a/datafusion/sql/src/unparser/utils.rs +++ b/datafusion/sql/src/unparser/utils.rs @@ -18,11 +18,14 @@ use datafusion_common::{ internal_err, tree_node::{Transformed, TreeNode}, - Column, DataFusionError, Result, + Column, DataFusionError, Result, ScalarValue, }; use datafusion_expr::{ utils::grouping_set_to_exprlist, Aggregate, Expr, LogicalPlan, Window, }; +use sqlparser::ast; + +use super::{dialect::DateFieldExtractStyle, Unparser}; /// Recursively searches children of [LogicalPlan] to find an Aggregate node if exists /// prior to encountering a Join, TableScan, or a nested subquery (derived table factor). @@ -187,3 +190,80 @@ fn find_window_expr<'a>( .flat_map(|w| w.window_expr.iter()) .find(|expr| expr.schema_name().to_string() == column_name) } + +/// Converts a date_part function to SQL, tailoring it to the supported date field extraction style. 
+pub(crate) fn date_part_to_sql( + unparser: &Unparser, + style: DateFieldExtractStyle, + date_part_args: &[Expr], +) -> Result<Option<ast::Expr>> { + match (style, date_part_args.len()) { + (DateFieldExtractStyle::Extract, 2) => { + let date_expr = unparser.expr_to_sql(&date_part_args[1])?; + if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &date_part_args[0] { + let field = match field.to_lowercase().as_str() { + "year" => ast::DateTimeField::Year, + "month" => ast::DateTimeField::Month, + "day" => ast::DateTimeField::Day, + "hour" => ast::DateTimeField::Hour, + "minute" => ast::DateTimeField::Minute, + "second" => ast::DateTimeField::Second, + _ => return Ok(None), + }; + + return Ok(Some(ast::Expr::Extract { + field, + expr: Box::new(date_expr), + syntax: ast::ExtractSyntax::From, + })); + } + } + (DateFieldExtractStyle::Strftime, 2) => { + let column = unparser.expr_to_sql(&date_part_args[1])?; + + if let Expr::Literal(ScalarValue::Utf8(Some(field))) = &date_part_args[0] { + let field = match field.to_lowercase().as_str() { + "year" => "%Y", + "month" => "%m", + "day" => "%d", + "hour" => "%H", + "minute" => "%M", + "second" => "%S", + _ => return Ok(None), + }; + + return Ok(Some(ast::Expr::Function(ast::Function { + name: ast::ObjectName(vec![ast::Ident { + value: "strftime".to_string(), + quote_style: None, + }]), + args: ast::FunctionArguments::List(ast::FunctionArgumentList { + duplicate_treatment: None, + args: vec![ + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr( + ast::Expr::Value(ast::Value::SingleQuotedString( + field.to_string(), + )), + )), + ast::FunctionArg::Unnamed(ast::FunctionArgExpr::Expr(column)), + ], + clauses: vec![], + }), + filter: None, + null_treatment: None, + over: None, + within_group: vec![], + parameters: ast::FunctionArguments::None, + }))); + } + } + (DateFieldExtractStyle::DatePart, _) => { + return Ok(Some( + unparser.scalar_function_to_sql("date_part", date_part_args)?, + )); + } + _ => {} + }; + + Ok(None) +} From 84c94097896e9d08e40d63e8c505414ea70e8b29 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 6 Oct 2024 14:33:35 -0400 Subject: [PATCH 15/22] Fix stack overflow calculating projected orderings (#12759) * Fix stack overflow calculating projected orderings * fix docs --- .../src/equivalence/properties.rs | 162 +++++++++++++----- 1 file changed, 121 insertions(+), 41 deletions(-) diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index 51e85e25e077..1c12f4697005 100644 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -1295,21 +1295,30 @@ fn construct_prefix_orderings( relevant_sort_expr: &PhysicalSortExpr, dependency_map: &DependencyMap, ) -> Vec<LexOrdering> { + let mut dep_enumerator = DependencyEnumerator::new(); dependency_map[relevant_sort_expr] .dependencies .iter() - .flat_map(|dep| construct_orderings(dep, dependency_map)) + .flat_map(|dep| dep_enumerator.construct_orderings(dep, dependency_map)) .collect() } -/// Given a set of relevant dependencies (`relevant_deps`) and a map of dependencies -/// (`dependency_map`), this function generates all possible prefix orderings -/// based on the given dependencies. +/// Generates all possible orderings where dependencies are satisfied for the +/// current projection expression.
+/// +/// # Example +/// If `dependencies` is `a + b ASC` and the dependency map holds dependencies +/// * `a ASC` --> `[c ASC]` +/// * `b ASC` --> `[d DESC]`, +/// +/// This function generates these two sort orders +/// * `[c ASC, d DESC, a + b ASC]` +/// * `[d DESC, c ASC, a + b ASC]` /// /// # Parameters /// -/// * `dependencies` - A reference to the dependencies. -/// * `dependency_map` - A reference to the map of dependencies for expressions. +/// * `dependencies` - Set of relevant expressions. +/// * `dependency_map` - Map of dependencies for expressions that may appear in `dependencies` /// /// # Returns /// @@ -1335,11 +1344,6 @@ fn generate_dependency_orderings( return vec![vec![]]; } - // Generate all possible orderings where dependencies are satisfied for the - // current projection expression. For example, if expression is `a + b ASC`, - // and the dependency for `a ASC` is `[c ASC]`, the dependency for `b ASC` - // is `[d DESC]`, then we generate `[c ASC, d DESC, a + b ASC]` and - // `[d DESC, c ASC, a + b ASC]`. relevant_prefixes .into_iter() .multi_cartesian_product() @@ -1421,7 +1425,7 @@ struct DependencyNode { } impl DependencyNode { - // Insert dependency to the state (if exists). + /// Insert dependency to the state (if exists). fn insert_dependency(&mut self, dependency: Option<&PhysicalSortExpr>) { if let Some(dep) = dependency { self.dependencies.insert(dep.clone()); @@ -1437,38 +1441,69 @@ impl DependencyNode { type DependencyMap = IndexMap<PhysicalSortExpr, DependencyNode>; type Dependencies = IndexSet<PhysicalSortExpr>; -/// This function recursively analyzes the dependencies of the given sort -/// expression within the given dependency map to construct lexicographical -/// orderings that include the sort expression and its dependencies. -/// -/// # Parameters -/// -/// - `referred_sort_expr`: A reference to the sort expression (`PhysicalSortExpr`) -/// for which lexicographical orderings satisfying its dependencies are to be -/// constructed. -/// - `dependency_map`: A reference to the `DependencyMap` that contains -/// dependencies for different `PhysicalSortExpr`s. -/// -/// # Returns -/// -/// A vector of lexicographical orderings (`Vec<LexOrdering>`) based on the given -/// sort expression and its dependencies. -fn construct_orderings( - referred_sort_expr: &PhysicalSortExpr, - dependency_map: &DependencyMap, -) -> Vec<LexOrdering> { - // We are sure that `referred_sort_expr` is inside `dependency_map`. - let node = &dependency_map[referred_sort_expr]; - // Since we work on intermediate nodes, we are sure `val.target_sort_expr` - // exists.
- let target_sort_expr = node.target_sort_expr.clone().unwrap(); - if node.dependencies.is_empty() { - vec![vec![target_sort_expr]] - } else { +/// Contains a mapping of all dependencies we have processed for each sort expr +struct DependencyEnumerator<'a> { + /// Maps `expr` --> `[exprs]` that have previously been processed + seen: IndexMap<&'a PhysicalSortExpr, IndexSet<&'a PhysicalSortExpr>>, +} + +impl<'a> DependencyEnumerator<'a> { + fn new() -> Self { + Self { + seen: IndexMap::new(), + } + } + + /// Insert a new dependency, + /// + /// returns false if the dependency was already in the map + /// returns true if the dependency was newly inserted + fn insert( + &mut self, + target: &'a PhysicalSortExpr, + dep: &'a PhysicalSortExpr, + ) -> bool { + self.seen.entry(target).or_default().insert(dep) + } + + /// This function recursively analyzes the dependencies of the given sort + /// expression within the given dependency map to construct lexicographical + /// orderings that include the sort expression and its dependencies. + /// + /// # Parameters + /// + /// - `referred_sort_expr`: A reference to the sort expression (`PhysicalSortExpr`) + /// for which lexicographical orderings satisfying its dependencies are to be + /// constructed. + /// - `dependency_map`: A reference to the `DependencyMap` that contains + /// dependencies for different `PhysicalSortExpr`s. + /// + /// # Returns + /// + /// A vector of lexicographical orderings (`Vec<LexOrdering>`) based on the given + /// sort expression and its dependencies. + fn construct_orderings( + &mut self, + referred_sort_expr: &'a PhysicalSortExpr, + dependency_map: &'a DependencyMap, + ) -> Vec<LexOrdering> { + // We are sure that `referred_sort_expr` is inside `dependency_map`. + let node = &dependency_map[referred_sort_expr]; + // Since we work on intermediate nodes, we are sure `val.target_sort_expr` + // exists.
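+        // Editorial note (not part of the original diff): the `seen` map
+        // consulted via `self.insert(..)` below is what breaks dependency
+        // cycles. With two equivalent input orderings such as [a, b, c, d]
+        // and [a, c, b, d] (see the new test below), the dependency map
+        // relates `b ASC` and `c ASC` to each other in both directions, and
+        // the previous unguarded recursion bounced between them until the
+        // stack overflowed.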
+ let target_sort_expr = node.target_sort_expr.as_ref().unwrap(); + if node.dependencies.is_empty() { + return vec![vec![target_sort_expr.clone()]]; + }; + node.dependencies .iter() .flat_map(|dep| { - let mut orderings = construct_orderings(dep, dependency_map); + let mut orderings = if self.insert(target_sort_expr, dep) { + self.construct_orderings(dep, dependency_map) + } else { + vec![] + }; for ordering in orderings.iter_mut() { ordering.push(target_sort_expr.clone()) } @@ -1763,6 +1798,51 @@ mod tests { Ok(()) } + #[test] + fn project_equivalence_properties_test_multi() -> Result<()> { + // test multiple input orderings with equivalence properties + let input_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + Field::new("c", DataType::Int64, true), + Field::new("d", DataType::Int64, true), + ])); + + let mut input_properties = EquivalenceProperties::new(Arc::clone(&input_schema)); + // add equivalent ordering [a, b, c, d] + input_properties.add_new_ordering(vec![ + parse_sort_expr("a", &input_schema), + parse_sort_expr("b", &input_schema), + parse_sort_expr("c", &input_schema), + parse_sort_expr("d", &input_schema), + ]); + + // add equivalent ordering [a, c, b, d] + input_properties.add_new_ordering(vec![ + parse_sort_expr("a", &input_schema), + parse_sort_expr("c", &input_schema), + parse_sort_expr("b", &input_schema), // NB b and c are swapped + parse_sort_expr("d", &input_schema), + ]); + + // simply project all the columns in order + let proj_exprs = vec![ + (col("a", &input_schema)?, "a".to_string()), + (col("b", &input_schema)?, "b".to_string()), + (col("c", &input_schema)?, "c".to_string()), + (col("d", &input_schema)?, "d".to_string()), + ]; + let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &input_schema)?; + let out_properties = input_properties.project(&projection_mapping, input_schema); + + assert_eq!( + out_properties.to_string(), + "order: [[a@0 ASC,c@2 ASC,b@1 ASC,d@3 ASC], [a@0 ASC,b@1 ASC,c@2 ASC,d@3 ASC]]" + ); + + Ok(()) + } + #[test] fn test_join_equivalence_properties() -> Result<()> { let schema = create_test_schema()?; From b3bf3af36eff83addca83507ef18a58b52b533bb Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Oct 2024 06:41:52 -0400 Subject: [PATCH 16/22] Port / Add Documentation for `VarianceSample` and `VariancePopulation` (#12742) --- .../functions-aggregate/src/variance.rs | 47 ++++++++++- .../user-guide/sql/aggregate_functions.md | 42 ---------- .../user-guide/sql/aggregate_functions_new.md | 81 +++++++++++++++++++ 3 files changed, 125 insertions(+), 45 deletions(-) diff --git a/datafusion/functions-aggregate/src/variance.rs b/datafusion/functions-aggregate/src/variance.rs index 3648ec0d1312..49a30344c212 100644 --- a/datafusion/functions-aggregate/src/variance.rs +++ b/datafusion/functions-aggregate/src/variance.rs @@ -18,22 +18,24 @@ //! [`VarianceSample`]: variance sample aggregations. //! [`VariancePopulation`]: variance population aggregations. 
-use std::{fmt::Debug, sync::Arc}; - use arrow::{ array::{Array, ArrayRef, BooleanArray, Float64Array, UInt64Array}, buffer::NullBuffer, compute::kernels::cast, datatypes::{DataType, Field}, }; +use std::sync::OnceLock; +use std::{fmt::Debug, sync::Arc}; use datafusion_common::{ downcast_value, not_impl_err, plan_err, DataFusionError, Result, ScalarValue, }; +use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, utils::format_state_name, - Accumulator, AggregateUDFImpl, GroupsAccumulator, Signature, Volatility, + Accumulator, AggregateUDFImpl, Documentation, GroupsAccumulator, Signature, + Volatility, }; use datafusion_functions_aggregate_common::{ aggregate::groups_accumulator::accumulate::accumulate, stats::StatsType, @@ -135,6 +137,26 @@ impl AggregateUDFImpl for VarianceSample { ) -> Result<Box<dyn GroupsAccumulator>> { Ok(Box::new(VarianceGroupsAccumulator::new(StatsType::Sample))) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_variance_sample_doc()) + } +} + +static VARIANCE_SAMPLE_DOC: OnceLock<Documentation> = OnceLock::new(); + +fn get_variance_sample_doc() -> &'static Documentation { + VARIANCE_SAMPLE_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the statistical sample variance of a set of numbers.", + ) + .with_syntax_example("var(expression)") + .with_standard_argument("expression", "Numeric") + .build() + .unwrap() + }) } pub struct VariancePopulation { @@ -222,6 +244,25 @@ impl AggregateUDFImpl for VariancePopulation { StatsType::Population, ))) } + fn documentation(&self) -> Option<&Documentation> { + Some(get_variance_population_doc()) + } +} + +static VARIANCE_POPULATION_DOC: OnceLock<Documentation> = OnceLock::new(); + +fn get_variance_population_doc() -> &'static Documentation { + VARIANCE_POPULATION_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_GENERAL) + .with_description( + "Returns the statistical population variance of a set of numbers.", + ) + .with_syntax_example("var_pop(expression)") + .with_standard_argument("expression", "Numeric") + .build() + .unwrap() + }) } /// An accumulator to compute variance diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index edb0e1d0c9f0..fe6a61e74e62 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -240,9 +240,6 @@ last_value(expression [ORDER BY expression]) - [stddev](#stddev) - [stddev_pop](#stddev_pop) - [stddev_samp](#stddev_samp) -- [var](#var) -- [var_pop](#var_pop) -- [var_samp](#var_samp) - [regr_avgx](#regr_avgx) - [regr_avgy](#regr_avgy) - [regr_count](#regr_count) @@ -349,45 +346,6 @@ stddev_samp(expression) #### Arguments -
- -### `var_samp` - -Returns the statistical sample variance of a set of numbers. - -``` -var_samp(expression) -``` - -#### Arguments - - **expression**: Expression to operate on. Can be a constant, column, or function, and any combination of arithmetic operators. diff --git a/docs/source/user-guide/sql/aggregate_functions_new.md b/docs/source/user-guide/sql/aggregate_functions_new.md index 2c8ebc3be990..213894d7da06 100644 --- a/docs/source/user-guide/sql/aggregate_functions_new.md +++ b/docs/source/user-guide/sql/aggregate_functions_new.md @@ -36,6 +36,11 @@ Aggregate functions operate on a set of values to compute a single result. - [bit_and](#bit_and) - [bit_or](#bit_or) - [bit_xor](#bit_xor) +- [var](#var) +- [var_pop](#var_pop) +- [var_population](#var_population) +- [var_samp](#var_samp) +- [var_sample](#var_sample) ### `bit_and` @@ -72,3 +77,79 @@ bit_xor(expression) #### Arguments - **expression**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. + +### `var` + +Returns the statistical sample variance of a set of numbers. + +``` +var(expression) +``` + +#### Arguments + +- **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Aliases- var_sample + +- var_samp + +### `var_pop` + +Returns the statistical population variance of a set of numbers. + +``` +var_pop(expression) +``` + +#### Arguments + +- **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Aliases- var_population + +### `var_pop` + +Returns the statistical population variance of a set of numbers. + +``` +var_pop(expression) +``` + +#### Arguments + +- **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Aliases- var_population + +### `var` + +Returns the statistical sample variance of a set of numbers. + +``` +var(expression) +``` + +#### Arguments + +- **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Aliases- var_sample + +- var_samp + +### `var` + +Returns the statistical sample variance of a set of numbers. + +``` +var(expression) +``` + +#### Arguments + +- **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. 
+ +#### Aliases- var_sample + +- var_samp From 9d8f77df79ad6515f216ab9a825a04d862bbdd4f Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 7 Oct 2024 08:48:37 -0400 Subject: [PATCH 17/22] Upgrade arrow/parquet to `53.1.0` / fix clippy (#12724) * Update to arrow/parquet 53.1.0 * Update some API * update for changed file sizes * Use non deprecated APIs * Use ParquetMetadataReader from @etseidl * remove upstreamed implementation * Update CSV schema * Use upstream is_null and is_not_null kernels --- Cargo.toml | 18 +- datafusion-cli/Cargo.lock | 289 +++++++++--------- .../core/src/datasource/file_format/csv.rs | 4 +- .../src/datasource/file_format/parquet.rs | 97 +++--- datafusion/functions/src/lib.rs | 3 - datafusion/functions/src/regex/regexplike.rs | 7 +- datafusion/functions/src/regexp_common.rs | 123 -------- datafusion/functions/src/string/contains.rs | 20 +- .../physical-expr/src/expressions/binary.rs | 8 +- .../src/expressions/is_not_null.rs | 2 +- .../physical-expr/src/expressions/is_null.rs | 77 +---- .../join_disable_repartition_joins.slt.temp | 26 -- .../test_files/repartition_scan.slt | 8 +- 13 files changed, 221 insertions(+), 461 deletions(-) delete mode 100644 datafusion/functions/src/regexp_common.rs delete mode 100644 datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp diff --git a/Cargo.toml b/Cargo.toml index b8bf83a5ab53..448607257ca1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -70,22 +70,22 @@ version = "42.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -arrow = { version = "53.0.0", features = [ +arrow = { version = "53.1.0", features = [ "prettyprint", ] } -arrow-array = { version = "53.0.0", default-features = false, features = [ +arrow-array = { version = "53.1.0", default-features = false, features = [ "chrono-tz", ] } -arrow-buffer = { version = "53.0.0", default-features = false } -arrow-flight = { version = "53.0.0", features = [ +arrow-buffer = { version = "53.1.0", default-features = false } +arrow-flight = { version = "53.1.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "53.0.0", default-features = false, features = [ +arrow-ipc = { version = "53.1.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "53.0.0", default-features = false } -arrow-schema = { version = "53.0.0", default-features = false } -arrow-string = { version = "53.0.0", default-features = false } +arrow-ord = { version = "53.1.0", default-features = false } +arrow-schema = { version = "53.1.0", default-features = false } +arrow-string = { version = "53.1.0", default-features = false } async-trait = "0.1.73" bigdecimal = "=0.4.1" bytes = "1.4" @@ -126,7 +126,7 @@ log = "^0.4" num_cpus = "1.13.0" object_store = { version = "0.11.0", default-features = false } parking_lot = "0.12" -parquet = { version = "53.0.0", default-features = false, features = [ +parquet = { version = "53.1.0", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 8c77ea8a2557..8a6ccacbb380 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5fb1d8e4442bd405fdfd1dacb42792696b0cf9cb15882e5d097b742a676d375" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" dependencies = [ "gimli", ] @@ -173,9 
+173,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45aef0d9cf9a039bf6cd1acc451b137aca819977b0928dece52bd92811b640ba" +checksum = "a9ba0d7248932f4e2a12fb37f0a2e3ec82b3bdedbac2a1dce186e036843b8f8c" dependencies = [ "arrow-arith", "arrow-array", @@ -194,9 +194,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03675e42d1560790f3524800e41403b40d0da1c793fe9528929fde06d8c7649a" +checksum = "d60afcdc004841a5c8d8da4f4fa22d64eb19c0c01ef4bcedd77f175a7cf6e38f" dependencies = [ "arrow-array", "arrow-buffer", @@ -209,9 +209,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd2bf348cf9f02a5975c5962c7fa6dee107a2009a7b41ac5fb1a027e12dc033f" +checksum = "7f16835e8599dbbb1659fd869d865254c4cf32c6c2bb60b6942ac9fc36bfa5da" dependencies = [ "ahash", "arrow-buffer", @@ -220,15 +220,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown", + "hashbrown 0.14.5", "num", ] [[package]] name = "arrow-buffer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3092e37715f168976012ce52273c3989b5793b0db5f06cbaa246be25e5f0924d" +checksum = "1a1f34f0faae77da6b142db61deba2cb6d60167592b178be317b341440acba80" dependencies = [ "bytes", "half", @@ -237,9 +237,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ce1018bb710d502f9db06af026ed3561552e493e989a79d0d0f5d9cf267a785" +checksum = "450e4abb5775bca0740bec0bcf1b1a5ae07eff43bd625661c4436d8e8e4540c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -258,9 +258,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd178575f45624d045e4ebee714e246a05d9652e41363ee3f57ec18cca97f740" +checksum = "d3a4e4d63830a341713e35d9a42452fbc6241d5f42fa5cf6a4681b8ad91370c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -277,9 +277,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e4ac0c4ee79150afe067dc4857154b3ee9c1cd52b5f40d59a77306d0ed18d65" +checksum = "2b1e618bbf714c7a9e8d97203c806734f012ff71ae3adc8ad1b075689f540634" dependencies = [ "arrow-buffer", "arrow-schema", @@ -289,9 +289,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb307482348a1267f91b0912e962cd53440e5de0f7fb24c5f7b10da70b38c94a" +checksum = "f98e983549259a2b97049af7edfb8f28b8911682040e99a94e4ceb1196bd65c2" dependencies = [ "arrow-array", "arrow-buffer", @@ -304,9 +304,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d24805ba326758effdd6f2cbdd482fcfab749544f21b134701add25b33f474e6" +checksum = "b198b9c6fcf086501730efbbcb483317b39330a116125af7bb06467d04b352a3" dependencies = [ "arrow-array", "arrow-buffer", @@ -324,9 +324,9 @@ 
dependencies = [ [[package]] name = "arrow-ord" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "644046c479d80ae8ed02a7f1e1399072ea344ca6a7b0e293ab2d5d9ed924aa3b" +checksum = "2427f37b4459a4b9e533045abe87a5183a5e0995a3fc2c2fd45027ae2cc4ef3f" dependencies = [ "arrow-array", "arrow-buffer", @@ -339,9 +339,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a29791f8eb13b340ce35525b723f5f0df17ecb955599e11f65c2a94ab34e2efb" +checksum = "15959657d92e2261a7a323517640af87f5afd9fd8a6492e424ebee2203c567f6" dependencies = [ "ahash", "arrow-array", @@ -353,15 +353,15 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c85320a3a2facf2b2822b57aa9d6d9d55edb8aee0b6b5d3b8df158e503d10858" +checksum = "fbf0388a18fd7f7f3fe3de01852d30f54ed5182f9004db700fbe3ba843ed2794" [[package]] name = "arrow-select" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cc7e6b582e23855fd1625ce46e51647aa440c20ea2e71b1d748e0839dd73cba" +checksum = "b83e5723d307a38bf00ecd2972cd078d1339c7fd3eb044f609958a9a24463f3a" dependencies = [ "ahash", "arrow-array", @@ -373,9 +373,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0775b6567c66e56ded19b87a954b6b1beffbdd784ef95a3a2b03f59570c1d230" +checksum = "7ab3db7c09dd826e74079661d84ed01ed06547cf75d52c2818ef776d0d852305" dependencies = [ "arrow-array", "arrow-buffer", @@ -406,9 +406,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.12" +version = "0.4.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fec134f64e2bc57411226dfc4e52dec859ddfc7e711fc5e07b612584f000e4aa" +checksum = "7e614738943d3f68c628ae3dbce7c3daffb196665f82f8c8ea6b65de73c79429" dependencies = [ "bzip2", "flate2", @@ -456,9 +456,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.7" +version = "1.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8191fb3091fa0561d1379ef80333c3c7191c6f0435d986e85821bcf7acbd1126" +checksum = "7198e6f03240fdceba36656d8be440297b6b82270325908c7381f37d826a74f6" dependencies = [ "aws-credential-types", "aws-runtime", @@ -523,9 +523,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.44.0" +version = "1.45.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b90cfe6504115e13c41d3ea90286ede5aa14da294f3fe077027a6e83850843c" +checksum = "e33ae899566f3d395cbf42858e433930682cc9c1889fa89318896082fef45efb" dependencies = [ "aws-credential-types", "aws-runtime", @@ -545,9 +545,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.45.0" +version = "1.46.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "167c0fad1f212952084137308359e8e4c4724d1c643038ce163f06de9662c1d0" +checksum = "f39c09e199ebd96b9f860b0fce4b6625f211e064ad7c8693b72ecf7ef03881e0" dependencies = [ "aws-credential-types", "aws-runtime", @@ -567,9 +567,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.44.0" +version = "1.45.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cb5f98188ec1435b68097daa2a37d74b9d17c9caa799466338a8d1544e71b9d" +checksum = "3d95f93a98130389eb6233b9d615249e543f6c24a68ca1f109af9ca5164a8765" dependencies = [ "aws-credential-types", "aws-runtime", @@ -917,9 +917,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.22" +version = "1.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9540e661f81799159abee814118cc139a2004b3a3aa3ea37724a1b66530b90e0" +checksum = "2e80e3b6a3ab07840e1cae9b0666a63970dc28e8ed5ffbcdacbfc760c281bfc1" dependencies = [ "jobserver", "libc", @@ -953,9 +953,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93698b29de5e97ad0ae26447b344c482a7284c737d9ddc5f9e52b74a336671bb" +checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" dependencies = [ "chrono", "chrono-tz-build", @@ -964,20 +964,19 @@ dependencies = [ [[package]] name = "chrono-tz-build" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c088aee841df9c3041febbb73934cfc39708749bf96dc827e3359cd39ef11b1" +checksum = "e94fea34d77a245229e7746bd2beb786cd2a896f306ff491fb8cecb3074b10a7" dependencies = [ "parse-zoneinfo", - "phf", "phf_codegen", ] [[package]] name = "clap" -version = "4.5.18" +version = "4.5.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0956a43b323ac1afaffc053ed5c4b7c1f1800bacd1683c353aabbb752515dd3" +checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615" dependencies = [ "clap_builder", "clap_derive", @@ -985,9 +984,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.18" +version = "4.5.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d72166dd41634086d5803a47eb71ae740e61d84709c36f3c34110173db3961b" +checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b" dependencies = [ "anstream", "anstyle", @@ -1175,7 +1174,7 @@ checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", "crossbeam-utils", - "hashbrown", + "hashbrown 0.14.5", "lock_api", "once_cell", "parking_lot_core", @@ -1216,7 +1215,7 @@ dependencies = [ "futures", "glob", "half", - "hashbrown", + "hashbrown 0.14.5", "indexmap", "itertools", "log", @@ -1293,7 +1292,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "hashbrown", + "hashbrown 0.14.5", "instant", "libc", "num_cpus", @@ -1322,7 +1321,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "futures", - "hashbrown", + "hashbrown 0.14.5", "log", "object_store", "parking_lot", @@ -1375,7 +1374,7 @@ dependencies = [ "datafusion-common", "datafusion-execution", "datafusion-expr", - "hashbrown", + "hashbrown 0.14.5", "hex", "itertools", "log", @@ -1468,7 +1467,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown", + "hashbrown 0.14.5", "indexmap", "itertools", "log", @@ -1496,7 +1495,7 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-physical-expr-common", "half", - "hashbrown", + "hashbrown 0.14.5", "hex", "indexmap", "itertools", @@ -1514,7 +1513,7 @@ dependencies = [ "arrow", "datafusion-common", "datafusion-expr-common", - "hashbrown", + "hashbrown 0.14.5", "rand", ] @@ -1553,7 +1552,7 @@ dependencies = [ "datafusion-physical-expr-common", "futures", 
"half", - "hashbrown", + "hashbrown 0.14.5", "indexmap", "itertools", "log", @@ -1758,9 +1757,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" dependencies = [ "futures-channel", "futures-core", @@ -1773,9 +1772,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -1783,15 +1782,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" dependencies = [ "futures-core", "futures-task", @@ -1800,15 +1799,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", @@ -1817,15 +1816,15 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-timer" @@ -1835,9 +1834,9 @@ checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -1874,9 +1873,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.31.0" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"32085ea23f3234fc7846555e85283ba4de91e21016dc0455a16286d87a292d64" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] name = "glob" @@ -1943,6 +1942,12 @@ dependencies = [ "allocator-api2", ] +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" + [[package]] name = "heck" version = "0.4.1" @@ -2043,9 +2048,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.4" +version = "1.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" +checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" [[package]] name = "httpdate" @@ -2129,7 +2134,7 @@ dependencies = [ "http 1.1.0", "hyper 1.4.1", "hyper-util", - "rustls 0.23.13", + "rustls 0.23.14", "rustls-native-certs 0.8.0", "rustls-pki-types", "tokio", @@ -2191,12 +2196,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.15.0", ] [[package]] @@ -2219,9 +2224,9 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.10.0" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "187674a687eed5fe42285b40c6291f9a01517d415fad1c3cbc6a9f778af7fcd4" +checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" [[package]] name = "is_terminal_polyfill" @@ -2270,9 +2275,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "lexical-core" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -2283,9 +2288,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -2294,9 +2299,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "0.8.6" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" dependencies = [ "lexical-util", "static_assertions", @@ -2304,18 +2309,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "0.8.5" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "0.8.5" 
+version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" dependencies = [ "lexical-util", "lexical-write-integer", @@ -2324,9 +2329,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "0.8.5" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" dependencies = [ "lexical-util", "static_assertions", @@ -2358,7 +2363,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" dependencies = [ "core2", - "hashbrown", + "hashbrown 0.14.5", "rle-decode-fast", ] @@ -2601,9 +2606,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.4" +version = "0.36.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" +checksum = "aedf0a2d09c573ed1d8d85b30c119153926a2b36dce0ab28322c09a117a4683e" dependencies = [ "memchr", ] @@ -2629,7 +2634,7 @@ dependencies = [ "rand", "reqwest", "ring", - "rustls-pemfile 2.1.3", + "rustls-pemfile 2.2.0", "serde", "serde_json", "snafu", @@ -2641,9 +2646,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "openssl-probe" @@ -2697,9 +2702,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" +checksum = "310c46a70a3ba90d98fec39fa2da6d9d731e544191da6fb56c9d199484d0dd3e" dependencies = [ "ahash", "arrow-array", @@ -2716,7 +2721,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown", + "hashbrown 0.14.5", "lz4_flex", "num", "num-bigint", @@ -2908,7 +2913,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.13", + "rustls 0.23.14", "socket2", "thiserror", "tokio", @@ -2925,7 +2930,7 @@ dependencies = [ "rand", "ring", "rustc-hash", - "rustls 0.23.13", + "rustls 0.23.14", "slab", "thiserror", "tinyvec", @@ -2996,9 +3001,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "355ae415ccd3a04315d3f8246e86d67689ea74d88d915576e1589a351062a13b" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ "bitflags 2.6.0", ] @@ -3016,9 +3021,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.6" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" +checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" dependencies = [ "aho-corasick", "memchr", @@ -3028,9 +3033,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +checksum = "368758f23274712b504848e9d5a6f010445cc8b87a7cdb4d7cbee666c1288da3" dependencies = [ "aho-corasick", "memchr", @@ -3045,9 +3050,9 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "relative-path" @@ -3057,9 +3062,9 @@ checksum = "ba39f3699c378cd8970968dcbff9c43159ea4cfbd88d43c00b22f2ef10a435d2" [[package]] name = "reqwest" -version = "0.12.7" +version = "0.12.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" +checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" dependencies = [ "base64 0.22.1", "bytes", @@ -3080,9 +3085,9 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.13", - "rustls-native-certs 0.7.3", - "rustls-pemfile 2.1.3", + "rustls 0.23.14", + "rustls-native-certs 0.8.0", + "rustls-pemfile 2.2.0", "rustls-pki-types", "serde", "serde_json", @@ -3199,9 +3204,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.13" +version = "0.23.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2dabaac7466917e566adb06783a81ca48944c6898a1b08b9374106dd671f4c8" +checksum = "415d9944693cb90382053259f89fbb077ea730ad7273047ec63b19bc9b160ba8" dependencies = [ "once_cell", "ring", @@ -3223,19 +3228,6 @@ dependencies = [ "security-framework", ] -[[package]] -name = "rustls-native-certs" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" -dependencies = [ - "openssl-probe", - "rustls-pemfile 2.1.3", - "rustls-pki-types", - "schannel", - "security-framework", -] - [[package]] name = "rustls-native-certs" version = "0.8.0" @@ -3243,7 +3235,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" dependencies = [ "openssl-probe", - "rustls-pemfile 2.1.3", + "rustls-pemfile 2.2.0", "rustls-pki-types", "schannel", "security-framework", @@ -3260,11 +3252,10 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.3" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" dependencies = [ - "base64 0.22.1", "rustls-pki-types", ] @@ -3340,9 +3331,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.24" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9aaafd5a2b6e3d657ff009d82fbd630b6bd54dd4eb06f21693925cdf80f9b8b" +checksum = "01227be5826fa0690321a2ba6c5cd57a19cf3f6a09e76973b58e61de6ab9d1c1" dependencies = [ "windows-sys 0.59.0", ] @@ -3781,7 +3772,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.13", + "rustls 0.23.14", "rustls-pki-types", "tokio", ] @@ -3897,9 +3888,9 @@ 
checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-bidi" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" [[package]] name = "unicode-ident" diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index e821fa806fce..f235c3b628a0 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -771,7 +771,7 @@ mod tests { "c7: Int64", "c8: Int64", "c9: Int64", - "c10: Int64", + "c10: Utf8", "c11: Float64", "c12: Float64", "c13: Utf8" @@ -907,7 +907,7 @@ mod tests { Field::new("c7", DataType::Int64, true), Field::new("c8", DataType::Int64, true), Field::new("c9", DataType::Int64, true), - Field::new("c10", DataType::Int64, true), + Field::new("c10", DataType::Utf8, true), Field::new("c11", DataType::Float64, true), Field::new("c12", DataType::Float64, true), Field::new("c13", DataType::Utf8, true), diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 98ae0ce14bd7..8647b5df90be 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -20,6 +20,7 @@ use std::any::Any; use std::fmt; use std::fmt::Debug; +use std::ops::Range; use std::sync::Arc; use super::write::demux::start_demuxer_task; @@ -47,7 +48,7 @@ use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::stats::Precision; use datafusion_common::{ - exec_err, internal_datafusion_err, not_impl_err, DataFusionError, GetExt, + internal_datafusion_err, not_impl_err, DataFusionError, GetExt, DEFAULT_PARQUET_EXTENSION, }; use datafusion_common_runtime::SpawnedTask; @@ -60,7 +61,7 @@ use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::metrics::MetricsSet; use async_trait::async_trait; -use bytes::{BufMut, BytesMut}; +use bytes::Bytes; use hashbrown::HashMap; use log::debug; use object_store::buffered::BufWriter; @@ -71,8 +72,7 @@ use parquet::arrow::arrow_writer::{ use parquet::arrow::{ arrow_to_parquet_schema, parquet_to_arrow_schema, AsyncArrowWriter, }; -use parquet::file::footer::{decode_footer, decode_metadata}; -use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData}; +use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData}; use parquet::file::properties::WriterProperties; use parquet::file::writer::SerializedFileWriter; use parquet::format::FileMetaData; @@ -84,10 +84,13 @@ use crate::datasource::physical_plan::parquet::{ can_expr_be_pushed_down_with_schemas, ParquetExecBuilder, }; use datafusion_physical_expr_common::sort_expr::LexRequirement; -use futures::{StreamExt, TryStreamExt}; +use futures::future::BoxFuture; +use futures::{FutureExt, StreamExt, TryStreamExt}; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; use parquet::arrow::arrow_reader::statistics::StatisticsConverter; +use parquet::arrow::async_reader::MetadataFetch; +use parquet::errors::ParquetError; /// Initial writing buffer size. Note this is just a size hint for efficiency. It /// will grow beyond the set value if needed. 
@@ -441,6 +444,33 @@ impl FileFormat for ParquetFormat { } } +/// [`MetadataFetch`] adapter for reading bytes from an [`ObjectStore`] +struct ObjectStoreFetch<'a> { + store: &'a dyn ObjectStore, + meta: &'a ObjectMeta, +} + +impl<'a> ObjectStoreFetch<'a> { + fn new(store: &'a dyn ObjectStore, meta: &'a ObjectMeta) -> Self { + Self { store, meta } + } +} + +impl<'a> MetadataFetch for ObjectStoreFetch<'a> { + fn fetch( + &mut self, + range: Range<usize>, + ) -> BoxFuture<'_, Result<Bytes, ParquetError>> { + async { + self.store + .get_range(&self.meta.location, range) + .await + .map_err(ParquetError::from) + } + .boxed() + } +} + /// Fetches parquet metadata from ObjectStore for given object /// /// This component is a subject to **change** in near future and is exposed for low level integrations @@ -452,57 +482,14 @@ pub async fn fetch_parquet_metadata( meta: &ObjectMeta, size_hint: Option<usize>, ) -> Result<ParquetMetaData> { - if meta.size < 8 { - return exec_err!("file size of {} is less than footer", meta.size); - } - - // If a size hint is provided, read more than the minimum size - // to try and avoid a second fetch. - let footer_start = if let Some(size_hint) = size_hint { - meta.size.saturating_sub(size_hint) - } else { - meta.size - 8 - }; - - let suffix = store - .get_range(&meta.location, footer_start..meta.size) - .await?; - - let suffix_len = suffix.len(); - - let mut footer = [0; 8]; - footer.copy_from_slice(&suffix[suffix_len - 8..suffix_len]); - - let length = decode_footer(&footer)?; + let file_size = meta.size; + let fetch = ObjectStoreFetch::new(store, meta); - if meta.size < length + 8 { - return exec_err!( - "file size of {} is less than footer + metadata {}", - meta.size, - length + 8 - ); - } - - // Did not fetch the entire file metadata in the initial read, need to make a second request - if length > suffix_len - 8 { - let metadata_start = meta.size - length - 8; - let remaining_metadata = store - .get_range(&meta.location, metadata_start..footer_start) - .await?; - - let mut metadata = BytesMut::with_capacity(length); - - metadata.put(remaining_metadata.as_ref()); - metadata.put(&suffix[..suffix_len - 8]); - - Ok(decode_metadata(metadata.as_ref())?) - } else { - let metadata_start = meta.size - length - 8; - - Ok(decode_metadata( - &suffix[metadata_start - footer_start..suffix_len - 8], - )?) - } + ParquetMetaDataReader::new() + .with_prefetch_hint(size_hint) + .load_and_finish(fetch, file_size) + .await + .map_err(DataFusionError::from) } /// Read and parse the schema of the Parquet file at location `path` diff --git a/datafusion/functions/src/lib.rs b/datafusion/functions/src/lib.rs index bb680f3c67de..81be5552666d 100644 --- a/datafusion/functions/src/lib.rs +++ b/datafusion/functions/src/lib.rs @@ -92,9 +92,6 @@ pub mod macros; pub mod string; make_stub_package!(string, "string_expressions"); -#[cfg(feature = "string_expressions")] -mod regexp_common; - /// Core datafusion expressions /// Enabled via feature flag `core_expressions` #[cfg(feature = "core_expressions")] diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index f647b1969168..e245ea9fa72f 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -16,7 +16,7 @@ // under the License. //!
Regx expressions -use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; +use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait}; use arrow::compute::kernels::regexp; use arrow::datatypes::DataType; use datafusion_common::exec_err; @@ -206,7 +206,8 @@ pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { 2 => { let values = as_generic_string_array::<T>(&args[0])?; let regex = as_generic_string_array::<T>(&args[1])?; - let array = regexp::regexp_is_match_utf8(values, regex, None) + let flags: Option<&GenericStringArray<T>> = None; + let array = regexp::regexp_is_match(values, regex, flags) .map_err(|e| arrow_datafusion_err!(e))?; Ok(Arc::new(array) as ArrayRef) @@ -220,7 +221,7 @@ pub fn regexp_like<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { return plan_err!("regexp_like() does not support the \"global\" option"); } - let array = regexp::regexp_is_match_utf8(values, regex, Some(flags)) + let array = regexp::regexp_is_match(values, regex, Some(flags)) .map_err(|e| arrow_datafusion_err!(e))?; Ok(Arc::new(array) as ArrayRef) diff --git a/datafusion/functions/src/regexp_common.rs b/datafusion/functions/src/regexp_common.rs deleted file mode 100644 index 748c1a294f97..000000000000 --- a/datafusion/functions/src/regexp_common.rs +++ /dev/null @@ -1,123 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Common utilities for implementing regex functions - -use crate::string::common::StringArrayType; - -use arrow::array::{Array, ArrayDataBuilder, BooleanArray}; -use arrow::datatypes::DataType; -use arrow_buffer::{BooleanBufferBuilder, NullBuffer}; -use datafusion_common::DataFusionError; -use regex::Regex; - -use std::collections::HashMap; - -#[cfg(doc)] -use arrow::array::{LargeStringArray, StringArray, StringViewArray}; -/// Perform SQL `array ~ regex_array` operation on -/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`]. -/// -/// If `regex_array` element has an empty value, the corresponding result value is always true. -/// -/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag, -/// which allow special search modes, such as case-insensitive and multi-line mode. -/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags) -/// for more information. -/// -/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs].
-/// -/// Can remove when is implemented upstream -/// -/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37 -pub fn regexp_is_match_utf8<'a, S1, S2, S3>( - array: &'a S1, - regex_array: &'a S2, - flags_array: Option<&'a S3>, -) -> datafusion_common::Result<BooleanArray> -where - &'a S1: StringArrayType<'a>, - &'a S2: StringArrayType<'a>, - &'a S3: StringArrayType<'a>, -{ - if array.len() != regex_array.len() { - return Err(DataFusionError::Execution( - "Cannot perform comparison operation on arrays of different length" - .to_string(), - )); - } - - let nulls = NullBuffer::union(array.nulls(), regex_array.nulls()); - - let mut patterns: HashMap<String, Regex> = HashMap::new(); - let mut result = BooleanBufferBuilder::new(array.len()); - - let complete_pattern = match flags_array { - Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map( - |(pattern, flags)| { - pattern.map(|pattern| match flags { - Some(flag) => format!("(?{flag}){pattern}"), - None => pattern.to_string(), - }) - }, - )) as Box<dyn Iterator<Item = Option<String>>>, - None => Box::new( - regex_array - .iter() - .map(|pattern| pattern.map(|pattern| pattern.to_string())), - ), - }; - - array - .iter() - .zip(complete_pattern) - .map(|(value, pattern)| { - match (value, pattern) { - (Some(_), Some(pattern)) if pattern == *"" => { - result.append(true); - } - (Some(value), Some(pattern)) => { - let existing_pattern = patterns.get(&pattern); - let re = match existing_pattern { - Some(re) => re, - None => { - let re = Regex::new(pattern.as_str()).map_err(|e| { - DataFusionError::Execution(format!( - "Regular expression did not compile: {e:?}" - )) - })?; - patterns.entry(pattern).or_insert(re) - } - }; - result.append(re.is_match(value)); - } - _ => result.append(false), - } - Ok(()) - }) - .collect::<Result<Vec<()>, DataFusionError>>()?; - - let data = unsafe { - ArrayDataBuilder::new(DataType::Boolean) - .len(array.len()) - .buffers(vec![result.into()]) - .nulls(nulls) - .build_unchecked() - }; - - Ok(BooleanArray::from(data)) -} diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs index c319f80661c3..722451ab5344 100644 --- a/datafusion/functions/src/string/contains.rs +++ b/datafusion/functions/src/string/contains.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License.
-use crate::regexp_common::regexp_is_match_utf8; use crate::utils::make_scalar_function; use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray, StringViewArray}; @@ -28,6 +27,7 @@ use datafusion_expr::ScalarUDFImpl; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, Signature, Volatility}; +use arrow::compute::regexp_is_match; use std::any::Any; use std::sync::Arc; @@ -92,7 +92,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef> { (Utf8View, Utf8View) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string_view(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< StringViewArray, StringViewArray, GenericStringArray<i32>, @@ -103,7 +103,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef> { (Utf8View, Utf8) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string::<i32>(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< StringViewArray, GenericStringArray<i32>, GenericStringArray<i32>, @@ -114,7 +114,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef> { (Utf8View, LargeUtf8) => { let mod_str = args[0].as_string_view(); let match_str = args[1].as_string::<i64>(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< StringViewArray, GenericStringArray<i64>, GenericStringArray<i32>, @@ -125,7 +125,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef> { (Utf8, Utf8View) => { let mod_str = args[0].as_string::<i32>(); let match_str = args[1].as_string_view(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray<i32>, StringViewArray, GenericStringArray<i32>, @@ -136,7 +136,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef> { (Utf8, Utf8) => { let mod_str = args[0].as_string::<i32>(); let match_str = args[1].as_string::<i32>(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray<i32>, GenericStringArray<i32>, GenericStringArray<i32>, @@ -147,7 +147,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef> { (Utf8, LargeUtf8) => { let mod_str = args[0].as_string::<i32>(); let match_str = args[1].as_string::<i64>(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray<i32>, GenericStringArray<i64>, GenericStringArray<i32>, @@ -158,7 +158,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef> { (LargeUtf8, Utf8View) => { let mod_str = args[0].as_string::<i64>(); let match_str = args[1].as_string_view(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray<i64>, StringViewArray, GenericStringArray<i32>, @@ -169,7 +169,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef> { (LargeUtf8, Utf8) => { let mod_str = args[0].as_string::<i64>(); let match_str = args[1].as_string::<i32>(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray<i64>, GenericStringArray<i32>, GenericStringArray<i32>, @@ -180,7 +180,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef> { (LargeUtf8, LargeUtf8) => { let mod_str = args[0].as_string::<i64>(); let match_str = args[1].as_string::<i64>(); - let res = regexp_is_match_utf8::< + let res = regexp_is_match::< GenericStringArray<i64>, GenericStringArray<i64>, GenericStringArray<i32>,
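All of the arms above funnel into arrow's `regexp_is_match` kernel, which replaces the local `regexp_is_match_utf8` copy deleted earlier in this patch. A hedged usage sketch (assuming the arrow 53.1 kernel re-exported at `arrow::compute::regexp_is_match`, the import the hunk above uses):

    use arrow::array::StringArray;
    use arrow::compute::regexp_is_match;

    fn main() -> Result<(), arrow::error::ArrowError> {
        // Element-wise `value ~ pattern`; the optional third argument
        // supplies per-row regex flags such as "i" for case-insensitivity.
        let values = StringArray::from(vec!["Foo", "bar"]);
        let patterns = StringArray::from(vec!["^f", "^B"]);
        let flags = StringArray::from(vec!["i", "i"]);
        let matched = regexp_is_match(&values, &patterns, Some(&flags))?;
        assert!(matched.value(0) && matched.value(1));
        Ok(())
    }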
diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 236b24dd4094..3d9072c2e14f 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -27,9 +27,7 @@ use crate::PhysicalExpr; use arrow::array::*; use arrow::compute::kernels::boolean::{and_kleene, not, or_kleene}; use arrow::compute::kernels::cmp::*; -use arrow::compute::kernels::comparison::{ - regexp_is_match_utf8, regexp_is_match_utf8_scalar, -}; +use arrow::compute::kernels::comparison::{regexp_is_match, regexp_is_match_scalar}; use arrow::compute::kernels::concat_elements::concat_elements_utf8; use arrow::compute::{cast, ilike, like, nilike, nlike}; use arrow::datatypes::*; @@ -179,7 +177,7 @@ macro_rules! compute_utf8_flag_op { } else { None }; - let mut array = paste::expr! {[<$OP _utf8>]}(&ll, &rr, flag.as_ref())?; + let mut array = $OP(ll, rr, flag.as_ref())?; if $NOT { array = not(&array).unwrap(); } @@ -216,7 +214,7 @@ macro_rules! compute_utf8_flag_op_scalar { if let ScalarValue::Utf8(Some(string_value)) | ScalarValue::LargeUtf8(Some(string_value)) = $RIGHT { let flag = $FLAG.then_some("i"); let mut array = - paste::expr! {[<$OP _utf8_scalar>]}(&ll, &string_value, flag)?; + paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?; if $NOT { array = not(&array).unwrap(); } diff --git a/datafusion/physical-expr/src/expressions/is_not_null.rs b/datafusion/physical-expr/src/expressions/is_not_null.rs index 58559352d44c..cbab7d0c9d1f 100644 --- a/datafusion/physical-expr/src/expressions/is_not_null.rs +++ b/datafusion/physical-expr/src/expressions/is_not_null.rs @@ -73,7 +73,7 @@ impl PhysicalExpr for IsNotNullExpr { let arg = self.arg.evaluate(batch)?; match arg { ColumnarValue::Array(array) => { - let is_not_null = super::is_null::compute_is_not_null(array)?; + let is_not_null = arrow::compute::is_not_null(&array)?; Ok(ColumnarValue::Array(Arc::new(is_not_null))) } ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
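Both null-check expressions now delegate directly to the upstream kernels; the union-array special casing deleted below is no longer needed once arrow handles union nulls itself. A minimal sketch of the direct calls (shown here on a plain `Int32Array` for brevity):

    use std::sync::Arc;
    use arrow::array::{ArrayRef, Int32Array};

    fn main() -> Result<(), arrow::error::ArrowError> {
        let array: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, Some(3)]));
        // Direct kernel calls; no downcasting or per-type workarounds.
        let nulls = arrow::compute::is_null(&array)?;
        let not_nulls = arrow::compute::is_not_null(&array)?;
        assert!(nulls.value(1) && !not_nulls.value(1));
        Ok(())
    }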
diff --git a/datafusion/physical-expr/src/expressions/is_null.rs b/datafusion/physical-expr/src/expressions/is_null.rs index 3cdb49bcab42..1c8597d3fdea 100644 --- a/datafusion/physical-expr/src/expressions/is_null.rs +++ b/datafusion/physical-expr/src/expressions/is_null.rs @@ -20,14 +20,10 @@ use std::hash::{Hash, Hasher}; use std::{any::Any, sync::Arc}; -use arrow::compute; use arrow::{ datatypes::{DataType, Schema}, record_batch::RecordBatch, }; -use arrow_array::{Array, ArrayRef, BooleanArray, Int8Array, UnionArray}; -use arrow_buffer::{BooleanBuffer, ScalarBuffer}; -use arrow_ord::cmp; use crate::physical_expr::down_cast_any_ref; use crate::PhysicalExpr; @@ -77,9 +73,9 @@ impl PhysicalExpr for IsNullExpr { fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> { let arg = self.arg.evaluate(batch)?; match arg { - ColumnarValue::Array(array) => { - Ok(ColumnarValue::Array(Arc::new(compute_is_null(array)?))) - } + ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new( + arrow::compute::is_null(&array)?, + ))), ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar( ScalarValue::Boolean(Some(scalar.is_null())), )), @@ -103,65 +99,6 @@ impl PhysicalExpr for IsNullExpr { } } -/// workaround , -/// this can be replaced with a direct call to `arrow::compute::is_null` once it's fixed. -pub(crate) fn compute_is_null(array: ArrayRef) -> Result<BooleanArray> { - if let Some(union_array) = array.as_any().downcast_ref::<UnionArray>() { - if let Some(offsets) = union_array.offsets() { - dense_union_is_null(union_array, offsets) - } else { - sparse_union_is_null(union_array) - } - } else { - compute::is_null(array.as_ref()).map_err(Into::into) - } -} - -/// workaround , -/// this can be replaced with a direct call to `arrow::compute::is_not_null` once it's fixed. -pub(crate) fn compute_is_not_null(array: ArrayRef) -> Result<BooleanArray> { - if array.as_any().is::<UnionArray>() { - compute::not(&compute_is_null(array)?).map_err(Into::into) - } else { - compute::is_not_null(array.as_ref()).map_err(Into::into) - } -} - -fn dense_union_is_null( - union_array: &UnionArray, - offsets: &ScalarBuffer<i32>, -) -> Result<BooleanArray> { - let child_arrays = (0..union_array.type_names().len()) - .map(|type_id| { - compute::is_null(&union_array.child(type_id as i8)).map_err(Into::into) - }) - .collect::<Result<Vec<BooleanArray>>>()?; - - let buffer: BooleanBuffer = offsets - .iter() - .zip(union_array.type_ids()) - .map(|(offset, type_id)| child_arrays[*type_id as usize].value(*offset as usize)) - .collect(); - - Ok(BooleanArray::new(buffer, None)) -} - -fn sparse_union_is_null(union_array: &UnionArray) -> Result<BooleanArray> { - let type_ids = Int8Array::new(union_array.type_ids().clone(), None); - - let mut union_is_null = - BooleanArray::new(BooleanBuffer::new_unset(union_array.len()), None); - for type_id in 0..union_array.type_names().len() { - let type_id = type_id as i8; - let union_is_child = cmp::eq(&type_ids, &Int8Array::new_scalar(type_id))?; - let child = union_array.child(type_id); - let child_array_is_null = compute::is_null(&child)?; - let child_is_null = compute::and(&union_is_child, &child_array_is_null)?; - union_is_null = compute::or(&union_is_null, &child_is_null)?; - } - Ok(union_is_null) -} - impl PartialEq<dyn Any> for IsNullExpr { fn eq(&self, other: &dyn Any) -> bool { down_cast_any_ref(other) @@ -184,7 +121,7 @@ mod tests { array::{BooleanArray, StringArray}, datatypes::*, }; - use arrow_array::{Float64Array, Int32Array}; + use arrow_array::{Array, Float64Array, Int32Array, UnionArray}; use arrow_buffer::ScalarBuffer; use datafusion_common::cast::as_boolean_array; @@ -243,8 +180,7 @@ mod tests { let array = UnionArray::try_new(union_fields(), type_ids, None, children).unwrap(); - let array_ref = Arc::new(array) as ArrayRef; - let result = compute_is_null(array_ref).unwrap(); + let result = arrow::compute::is_null(&array).unwrap(); let expected = &BooleanArray::from(vec![false, true, false, false, true, true, false]); @@ -272,8 +208,7 @@ mod tests { UnionArray::try_new(union_fields(), type_ids, Some(offsets), children) .unwrap(); - let array_ref = Arc::new(array) as ArrayRef; - let result = compute_is_null(array_ref).unwrap(); + let result = arrow::compute::is_null(&array).unwrap(); let expected = &BooleanArray::from(vec![false, true, false, true, false, true]); assert_eq!(expected, &result); diff --git a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp deleted file mode 100644 index 00e74a207b33..000000000000 --- a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt.temp +++ /dev/null @@ -1,26 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at - -# http://www.apache.org/licenses/LICENSE-2.0 - -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied.
See the License for the -# specific language governing permissions and limitations -# under the License. - -########## -## Join Tests -########## - -# turn off repartition_joins -statement ok -set datafusion.optimizer.repartition_joins = false; - -include ./join.slt diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 4c86312f9e51..858e42106221 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -61,7 +61,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] # disable round robin repartitioning statement ok @@ -77,7 +77,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..88], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:88..176], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:176..264], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:264..351]]}, projection=[column1], predicate=column1@0 != 42, 
pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] # enable round robin repartitioning again statement ok @@ -102,7 +102,7 @@ physical_plan 02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=8192 04)------FilterExec: column1@0 != 42 -05)--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..172], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:172..338, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..178], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:178..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +05)--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:174..342, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..180], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:180..351]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] ## Read the files as though they are ordered @@ -138,7 +138,7 @@ physical_plan 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: column1@0 != 42 -04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..169], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..173], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:173..347], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:169..338]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] +04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..171], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..175], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:175..351], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:171..342]]}, 
projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)] # Cleanup statement ok From 583bdc2acc5bc722e233d1f932dfc2d4de8ac3ac Mon Sep 17 00:00:00 2001 From: Weston Pace Date: Mon, 7 Oct 2024 05:53:39 -0700 Subject: [PATCH 18/22] feat: add support for Substrait ExtendedExpression (#12728) * Add support for serializing and deserializing Substrait ExtendedExpr message * Address clippy reviews * Reuse existing rename method --- .../substrait/src/logical_plan/consumer.rs | 203 ++++++++++++---- .../substrait/src/logical_plan/producer.rs | 230 ++++++++++++++---- 2 files changed, 338 insertions(+), 95 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index e6bfc67eda81..030536f9f830 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -34,6 +34,7 @@ use datafusion::logical_expr::{ ExprSchemable, LogicalPlan, Operator, Projection, SortExpr, Values, }; use substrait::proto::expression::subquery::set_predicate::PredicateOp; +use substrait::proto::expression_reference::ExprType; use url::Url; use crate::extensions::Extensions; @@ -96,7 +97,7 @@ use substrait::proto::{ sort_field::{SortDirection, SortKind::*}, AggregateFunction, Expression, NamedStruct, Plan, Rel, Type, }; -use substrait::proto::{FunctionArgument, SortField}; +use substrait::proto::{ExtendedExpression, FunctionArgument, SortField}; // Substrait PrecisionTimestampTz indicates that the timestamp is relative to UTC, which // is the same as the expectation for any non-empty timezone in DF, so any non-empty timezone @@ -251,6 +252,81 @@ pub async fn from_substrait_plan( } } +/// An ExprContainer is a container for a collection of expressions with a common input schema +/// +/// In addition, each expression is associated with a field, which defines the +/// expression's output. The data type and nullability of the field are calculated from the +/// expression and the input schema. However the names of the field (and its nested fields) are +/// derived from the Substrait message. +pub struct ExprContainer { + /// The input schema for the expressions + pub input_schema: DFSchemaRef, + /// The expressions + /// + /// Each item contains an expression and the field that defines the expected nullability and name of the expr's output + pub exprs: Vec<(Expr, Field)>, +} + +/// Convert Substrait ExtendedExpression to ExprContainer +/// +/// A Substrait ExtendedExpression message contains one or more expressions, +/// with names for the outputs, and an input schema. These pieces are all included +/// in the ExprContainer. +/// +/// This is a top-level message and can be used to send expressions (not plans) +/// between systems. This is often useful for scenarios like pushdown where filter +/// expressions need to be sent to remote systems. 
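Together with the producer-side `to_substrait_extended_expr` added later in this patch, the function below enables the round trip sketched here. This mirrors the `extended_expressions` test at the end of the patch; it assumes an async context and the crate paths used by that test module:

    use datafusion::arrow::datatypes::{DataType, Field};
    use datafusion::common::{DFSchema, DFSchemaRef, ScalarValue};
    use datafusion::logical_expr::Expr;
    use datafusion::prelude::SessionContext;
    use datafusion_substrait::logical_plan::consumer::from_substrait_extended_expr;
    use datafusion_substrait::logical_plan::producer::to_substrait_extended_expr;

    async fn roundtrip() -> datafusion::common::Result<()> {
        let ctx = SessionContext::new();
        let expr = Expr::Literal(ScalarValue::Int32(Some(42)));
        let field = Field::new("out", DataType::Int32, false);
        let schema = DFSchemaRef::new(DFSchema::empty());

        // Producer: serialize the expression plus its output name.
        let msg = to_substrait_extended_expr(&[(&expr, &field)], &schema, &ctx)?;
        // Consumer: rebuild the expression and the field describing its output.
        let container = from_substrait_extended_expr(&ctx, &msg).await?;
        assert_eq!(container.exprs.len(), 1);
        Ok(())
    }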
+pub async fn from_substrait_extended_expr( + ctx: &SessionContext, + extended_expr: &ExtendedExpression, +) -> Result<ExprContainer> { + // Register function extension + let extensions = Extensions::try_from(&extended_expr.extensions)?; + if !extensions.type_variations.is_empty() { + return not_impl_err!("Type variation extensions are not supported"); + } + + let input_schema = DFSchemaRef::new(match &extended_expr.base_schema { + Some(base_schema) => from_substrait_named_struct(base_schema, &extensions), + None => { + plan_err!("required property `base_schema` missing from Substrait ExtendedExpression message") + } + }?); + + // Parse expressions + let mut exprs = Vec::with_capacity(extended_expr.referred_expr.len()); + for (expr_idx, substrait_expr) in extended_expr.referred_expr.iter().enumerate() { + let scalar_expr = match &substrait_expr.expr_type { + Some(ExprType::Expression(scalar_expr)) => Ok(scalar_expr), + Some(ExprType::Measure(_)) => { + not_impl_err!("Measure expressions are not yet supported") + } + None => { + plan_err!("required property `expr_type` missing from Substrait ExpressionReference message") + } + }?; + let expr = + from_substrait_rex(ctx, scalar_expr, &input_schema, &extensions).await?; + let (output_type, expected_nullability) = + expr.data_type_and_nullable(&input_schema)?; + let output_field = Field::new("", output_type, expected_nullability); + let mut names_idx = 0; + let output_field = rename_field( + &output_field, + &substrait_expr.output_names, + expr_idx, + &mut names_idx, + /*rename_self=*/ true, + )?; + exprs.push((expr, output_field)); + } + + Ok(ExprContainer { + input_schema, + exprs, + }) +} + /// parse projection pub fn extract_projection( t: LogicalPlan, @@ -334,6 +410,68 @@ fn rename_expressions( .collect() } +fn rename_field( + field: &Field, + dfs_names: &Vec<String>, + unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}" + name_idx: &mut usize, // Index into dfs_names + rename_self: bool, // Some fields (e.g. list items) don't have names in Substrait and this will be false to keep old name +) -> Result<Field> { + let name = if rename_self { + next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)? + } else { + field.name().to_string() + }; + match field.data_type() { + DataType::Struct(children) => { + let children = children + .iter() + .enumerate() + .map(|(child_idx, f)| { + rename_field( + f.as_ref(), + dfs_names, + child_idx, + name_idx, + /*rename_self=*/ true, + ) + }) + .collect::<Result<_>>()?; + Ok(field + .to_owned() + .with_name(name) + .with_data_type(DataType::Struct(children))) + } + DataType::List(inner) => { + let renamed_inner = rename_field( + inner.as_ref(), + dfs_names, + 0, + name_idx, + /*rename_self=*/ false, + )?; + Ok(field + .to_owned() + .with_data_type(DataType::List(FieldRef::new(renamed_inner))) + .with_name(name)) + } + DataType::LargeList(inner) => { + let renamed_inner = rename_field( + inner.as_ref(), + dfs_names, + 0, + name_idx, + /*rename_self= */ false, + )?; + Ok(field + .to_owned() + .with_data_type(DataType::LargeList(FieldRef::new(renamed_inner))) + .with_name(name)) + } + _ => Ok(field.to_owned().with_name(name)), + } +} + /// Produce a version of the given schema with names matching the given list of names. /// Substrait doesn't deal with column (incl.
nested struct field) names within the schema, /// but it does give us the list of expected names at the end of the plan, so we use this @@ -342,59 +480,20 @@ fn make_renamed_schema( schema: &DFSchemaRef, dfs_names: &Vec<String>, ) -> Result<DFSchema> { - fn rename_inner_fields( - dtype: &DataType, - dfs_names: &Vec<String>, - name_idx: &mut usize, - ) -> Result<DataType> { - match dtype { - DataType::Struct(fields) => { - let fields = fields - .iter() - .map(|f| { - let name = next_struct_field_name(0, dfs_names, name_idx)?; - Ok((**f).to_owned().with_name(name).with_data_type( - rename_inner_fields(f.data_type(), dfs_names, name_idx)?, - )) - }) - .collect::<Result<_>>()?; - Ok(DataType::Struct(fields)) - } - DataType::List(inner) => Ok(DataType::List(FieldRef::new( - (**inner).to_owned().with_data_type(rename_inner_fields( - inner.data_type(), - dfs_names, - name_idx, - )?), - ))), - DataType::LargeList(inner) => Ok(DataType::LargeList(FieldRef::new( - (**inner).to_owned().with_data_type(rename_inner_fields( - inner.data_type(), - dfs_names, - name_idx, - )?), - ))), - _ => Ok(dtype.to_owned()), - } - } - let mut name_idx = 0; let (qualifiers, fields): (_, Vec<Field>) = schema .iter() - .map(|(q, f)| { - let name = next_struct_field_name(0, dfs_names, &mut name_idx)?; - Ok(( - q.cloned(), - (**f) - .to_owned() - .with_name(name) - .with_data_type(rename_inner_fields( - f.data_type(), - dfs_names, - &mut name_idx, - )?), - )) + .enumerate() + .map(|(field_idx, (q, f))| { + let renamed_f = rename_field( + f.as_ref(), + dfs_names, + field_idx, + &mut name_idx, + /*rename_self=*/ true, + )?; + Ok((q.cloned(), renamed_f)) }) .collect::<Result<Vec<_>>>()? .into_iter() @@ -1681,14 +1780,14 @@ fn from_substrait_struct_type( } fn next_struct_field_name( - i: usize, + column_idx: usize, dfs_names: &[String], name_idx: &mut usize, ) -> Result<String> { if dfs_names.is_empty() { // If names are not given, create dummy names // c0, c1, ... align with e.g. SqlToRel::create_named_struct Ok(format!("c{column_idx}")) } else { let name = dfs_names.get(*name_idx).cloned().ok_or_else(|| { substrait_datafusion_err!("Named schema must contain names for all fields")
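The producer side below adds `flatten_names`, the mirror image of the consumer's renaming logic: both sides must agree on a depth-first name order in which struct fields contribute a name but list and map wrappers do not. A toy model of that ordering, using hypothetical types and matching the expectation of the `named_struct_names` test later in the patch:

    enum Ty {
        Leaf,
        Struct(Vec<(String, Ty)>),
        List(Box<Ty>),
    }

    // Push the field's own name (unless it is a synthetic container child,
    // like a list item), then recurse into named children.
    fn flatten(name: Option<&str>, ty: &Ty, out: &mut Vec<String>) {
        if let Some(name) = name {
            out.push(name.to_string());
        }
        match ty {
            Ty::Leaf => {}
            Ty::Struct(fields) => {
                for (n, t) in fields {
                    flatten(Some(n), t, out);
                }
            }
            Ty::List(item) => flatten(None, item, out),
        }
    }

    fn main() {
        // Schema: int, struct { inner: List<Utf8> }, trailer
        let fields = vec![
            ("int".to_string(), Ty::Leaf),
            (
                "struct".to_string(),
                Ty::Struct(vec![("inner".to_string(), Ty::List(Box::new(Ty::Leaf)))]),
            ),
            ("trailer".to_string(), Ty::Leaf),
        ];
        let mut names = Vec::new();
        for (n, t) in &fields {
            flatten(Some(n), t, &mut names);
        }
        assert_eq!(names, ["int", "struct", "inner", "trailer"]);
    }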
diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs index fada827875b0..1165ce13d236 100644 --- a/datafusion/substrait/src/logical_plan/producer.rs +++ b/datafusion/substrait/src/logical_plan/producer.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. -use itertools::Itertools; use std::sync::Arc; +use substrait::proto::expression_reference::ExprType; use arrow_buffer::ToByteSlice; -use datafusion::arrow::datatypes::IntervalUnit; +use datafusion::arrow::datatypes::{Field, IntervalUnit}; use datafusion::logical_expr::{ CrossJoin, Distinct, Like, Partitioning, WindowFrameUnits, }; @@ -63,7 +63,9 @@ use substrait::proto::expression::window_function::BoundsType; use substrait::proto::read_rel::VirtualTable; use substrait::proto::rel_common::EmitKind; use substrait::proto::rel_common::EmitKind::Emit; -use substrait::proto::{rel_common, CrossRel, ExchangeRel, RelCommon}; +use substrait::proto::{ + rel_common, CrossRel, ExchangeRel, ExpressionReference, ExtendedExpression, RelCommon, +}; use substrait::{ proto::{ aggregate_function::AggregationInvocation, @@ -119,6 +121,56 @@ pub fn to_substrait_plan(plan: &LogicalPlan, ctx: &SessionContext) -> Result +pub fn to_substrait_extended_expr( + exprs: &[(&Expr, &Field)], + schema: &DFSchemaRef, + ctx: &SessionContext, +) -> Result<Box<ExtendedExpression>> { + let mut extensions = Extensions::default(); + + let substrait_exprs = exprs + .iter() + .map(|(expr, field)| { + let substrait_expr = to_substrait_rex( + ctx, + expr, + schema, + /*col_ref_offset=*/ 0, + &mut extensions, + )?; + let mut output_names = Vec::new(); + flatten_names(field, false, &mut output_names)?; + Ok(ExpressionReference { + output_names, + expr_type: Some(ExprType::Expression(substrait_expr)), + }) + }) + .collect::<Result<Vec<_>>>()?; + let substrait_schema = to_substrait_named_struct(schema, &mut extensions)?; + + Ok(Box::new(ExtendedExpression { + advanced_extensions: None, + expected_type_urls: vec![], + extension_uris: vec![], + extensions: extensions.into(), + version: Some(version::version_with_producer("datafusion")), + referred_expr: substrait_exprs, + base_schema: Some(substrait_schema), + })) +} + /// Convert DataFusion LogicalPlan to Substrait Rel pub fn to_substrait_rel( plan: &LogicalPlan, @@ -580,50 +632,43 @@ fn create_project_remapping(expr_count: usize, input_field_count: usize) -> Emit Emit(rel_common::Emit { output_mapping }) } +// Substrait wants a list of all field names, including nested fields from structs, +// also from within e.g. lists and maps. However, it does not want the list and map field names +// themselves - only proper structs fields are considered to have useful names. +fn flatten_names(field: &Field, skip_self: bool, names: &mut Vec<String>) -> Result<()> { + if !skip_self { + names.push(field.name().to_string()); + } + match field.data_type() { + DataType::Struct(fields) => { + for field in fields { + flatten_names(field, false, names)?; + } + Ok(()) + } + DataType::List(l) => flatten_names(l, true, names), + DataType::LargeList(l) => flatten_names(l, true, names), + DataType::Map(m, _) => match m.data_type() { + DataType::Struct(key_and_value) if key_and_value.len() == 2 => { + flatten_names(&key_and_value[0], true, names)?; + flatten_names(&key_and_value[1], true, names) + } + _ => plan_err!("Map fields must contain a Struct with exactly 2 fields"), + }, + _ => Ok(()), + }?; + Ok(()) +} + fn to_substrait_named_struct( schema: &DFSchemaRef, extensions: &mut Extensions, ) -> Result<NamedStruct> { - // Substrait wants a list of all field names, including nested fields from structs, - // also from within e.g. lists and maps. However, it does not want the list and map field names - // themselves - only proper structs fields are considered to have useful names.
- fn names_dfs(dtype: &DataType) -> Result<Vec<String>> { - match dtype { - DataType::Struct(fields) => { - let mut names = Vec::new(); - for field in fields { - names.push(field.name().to_string()); - names.extend(names_dfs(field.data_type())?); - } - Ok(names) - } - DataType::List(l) => names_dfs(l.data_type()), - DataType::LargeList(l) => names_dfs(l.data_type()), - DataType::Map(m, _) => match m.data_type() { - DataType::Struct(key_and_value) if key_and_value.len() == 2 => { - let key_names = - names_dfs(key_and_value.first().unwrap().data_type())?; - let value_names = - names_dfs(key_and_value.last().unwrap().data_type())?; - Ok([key_names, value_names].concat()) - } - _ => plan_err!("Map fields must contain a Struct with exactly 2 fields"), - }, - _ => Ok(Vec::new()), - } + let mut names = Vec::with_capacity(schema.fields().len()); + for field in schema.fields() { + flatten_names(field, false, &mut names)?; } - let names = schema - .fields() - .iter() - .map(|f| { - let mut names = vec![f.name().to_string()]; - names.extend(names_dfs(f.data_type())?); - Ok(names) - }) - .flatten_ok() - .collect::<Result<_>>()?; - let field_types = r#type::Struct { types: schema .fields() @@ -2178,14 +2223,16 @@ fn substrait_field_ref(index: usize) -> Result<Expression> { mod test { use super::*; use crate::logical_plan::consumer::{ - from_substrait_literal_without_names, from_substrait_type_without_names, + from_substrait_extended_expr, from_substrait_literal_without_names, + from_substrait_named_struct, from_substrait_type_without_names, }; use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano}; use datafusion::arrow::array::{ GenericListArray, Int64Builder, MapBuilder, StringBuilder, }; - use datafusion::arrow::datatypes::Field; + use datafusion::arrow::datatypes::{Field, Fields, Schema}; use datafusion::common::scalar::ScalarStructBuilder; + use datafusion::common::DFSchema; use std::collections::HashMap; #[test] @@ -2461,9 +2508,9 @@ mod test { Ok(()) } + + #[test] + fn named_struct_names() -> Result<()> { + let mut extensions = Extensions::default(); + let schema = DFSchemaRef::new(DFSchema::try_from(Schema::new(vec![ + Field::new("int", DataType::Int32, true), + Field::new( + "struct", + DataType::Struct(Fields::from(vec![Field::new( + "inner", + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + )])), + true, + ), + Field::new("trailer", DataType::Float64, true), + ]))?); + + let named_struct = to_substrait_named_struct(&schema, &mut extensions)?; + + // Struct field names should be flattened DFS style + // List field names should be omitted + assert_eq!( + named_struct.names, + vec!["int", "struct", "inner", "trailer"] + ); + + let roundtrip_schema = from_substrait_named_struct(&named_struct, &extensions)?; + assert_eq!(schema.as_ref(), &roundtrip_schema); + Ok(()) + } + + #[tokio::test] + async fn extended_expressions() -> Result<()> { + let ctx = SessionContext::new(); + + // One expression, empty input schema + let expr = Expr::Literal(ScalarValue::Int32(Some(42))); + let field = Field::new("out", DataType::Int32, false); + let empty_schema = DFSchemaRef::new(DFSchema::empty()); + let substrait = + to_substrait_extended_expr(&[(&expr, &field)], &empty_schema, &ctx)?; + let roundtrip_expr = from_substrait_extended_expr(&ctx, &substrait).await?; + + assert_eq!(roundtrip_expr.input_schema, empty_schema); + assert_eq!(roundtrip_expr.exprs.len(), 1); + + let (rt_expr, rt_field) = roundtrip_expr.exprs.first().unwrap(); + assert_eq!(rt_field, &field); + assert_eq!(rt_expr, &expr); + + // Multiple
expressions, with column references + let expr1 = Expr::Column("c0".into()); + let expr2 = Expr::Column("c1".into()); + let out1 = Field::new("out1", DataType::Int32, true); + let out2 = Field::new("out2", DataType::Utf8, true); + let input_schema = DFSchemaRef::new(DFSchema::try_from(Schema::new(vec![ + Field::new("c0", DataType::Int32, true), + Field::new("c1", DataType::Utf8, true), + ]))?); + + let substrait = to_substrait_extended_expr( + &[(&expr1, &out1), (&expr2, &out2)], + &input_schema, + &ctx, + )?; + let roundtrip_expr = from_substrait_extended_expr(&ctx, &substrait).await?; + + assert_eq!(roundtrip_expr.input_schema, input_schema); + assert_eq!(roundtrip_expr.exprs.len(), 2); + + let mut exprs = roundtrip_expr.exprs.into_iter(); + + let (rt_expr, rt_field) = exprs.next().unwrap(); + assert_eq!(rt_field, out1); + assert_eq!(rt_expr, expr1); + + let (rt_expr, rt_field) = exprs.next().unwrap(); + assert_eq!(rt_field, out2); + assert_eq!(rt_expr, expr2); + + Ok(()) + } + + #[tokio::test] + async fn invalid_extended_expression() { + let ctx = SessionContext::new(); + + // Not ok if input schema is missing field referenced by expr + let expr = Expr::Column("missing".into()); + let field = Field::new("out", DataType::Int32, false); + let empty_schema = DFSchemaRef::new(DFSchema::empty()); + + let err = to_substrait_extended_expr(&[(&expr, &field)], &empty_schema, &ctx); + + assert!(matches!(err, Err(DataFusionError::SchemaError(_, _)))); + } } From 134939adad79d9804cd7d423aaf9a1bf8c46fcb4 Mon Sep 17 00:00:00 2001 From: Val Lorentz Date: Mon, 7 Oct 2024 16:31:24 +0200 Subject: [PATCH 19/22] Transformed::new_transformed: Fix documentation formatting (#12787) Co-authored-by: Andrew Lamb --- datafusion/common/src/tree_node.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs index 88300e3edd0e..b4d3251fd263 100644 --- a/datafusion/common/src/tree_node.rs +++ b/datafusion/common/src/tree_node.rs @@ -681,7 +681,7 @@ impl Transformed { } } - /// Create a `Transformed` with `transformed and [`TreeNodeRecursion::Continue`]. + /// Create a `Transformed` with `transformed` and [`TreeNodeRecursion::Continue`]. 
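The one-line doc fix above concerns `Transformed`; for reference, a minimal sketch of the usual rewrite pattern built on it, just ahead of the definition it documents (the column-to-literal rewrite is illustrative, not from this patch):

```rust
use datafusion_common::tree_node::{Transformed, TreeNode};
use datafusion_common::Result;
use datafusion_expr::{col, lit, Expr};

fn main() -> Result<()> {
    // Rewrite an expression tree, replacing every column reference with a
    // literal. Each closure call reports whether it changed the node; the
    // default recursion control is TreeNodeRecursion::Continue.
    let expr = col("a") + col("b");
    let result = expr.transform(|e| {
        Ok(match e {
            Expr::Column(_) => Transformed::yes(lit(0)),
            other => Transformed::no(other),
        })
    })?;
    assert!(result.transformed);

    // new_transformed is the two-argument form, used when the caller
    // already knows whether anything changed.
    let t = Transformed::new_transformed(lit(1), false);
    assert!(!t.transformed);
    Ok(())
}
```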
pub fn new_transformed(data: T, transformed: bool) -> Self { Self::new(data, transformed, TreeNodeRecursion::Continue) } From ef227f41cba69718e16557e164415d20b83a4bd6 Mon Sep 17 00:00:00 2001 From: Emil Ejbyfeldt Date: Mon, 7 Oct 2024 18:24:46 +0200 Subject: [PATCH 20/22] fix: Correct results for grouping sets when columns contain nulls (#12571) * Fix grouping sets behavior when data contains nulls * PR suggestion comment * Update new test case * Add grouping_id to the logical plan * Add doc comment next to INTERNAL_GROUPING_ID * Fix unparsing of Aggregate with grouping sets --------- Co-authored-by: Andrew Lamb --- datafusion/core/src/dataframe/mod.rs | 17 + datafusion/core/src/physical_planner.rs | 14 +- datafusion/expr/src/logical_plan/plan.rs | 56 ++- datafusion/expr/src/utils.rs | 12 +- .../src/single_distinct_to_groupby.rs | 6 +- .../physical-plan/src/aggregates/mod.rs | 370 +++++++++++------- .../physical-plan/src/aggregates/row_hash.rs | 6 +- datafusion/sql/src/unparser/utils.rs | 17 +- .../sqllogictest/test_files/aggregate.slt | 32 +- .../sqllogictest/test_files/group_by.slt | 11 +- .../tests/cases/roundtrip_logical_plan.rs | 5 +- 11 files changed, 359 insertions(+), 187 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index f5867881da13..67e2a4780d06 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -535,9 +535,26 @@ impl DataFrame { group_expr: Vec, aggr_expr: Vec, ) -> Result { + let is_grouping_set = matches!(group_expr.as_slice(), [Expr::GroupingSet(_)]); + let aggr_expr_len = aggr_expr.len(); let plan = LogicalPlanBuilder::from(self.plan) .aggregate(group_expr, aggr_expr)? .build()?; + let plan = if is_grouping_set { + let grouping_id_pos = plan.schema().fields().len() - 1 - aggr_expr_len; + // For grouping sets we do a project to not expose the internal grouping id + let exprs = plan + .schema() + .columns() + .into_iter() + .enumerate() + .filter(|(idx, _)| *idx != grouping_id_pos) + .map(|(_, column)| Expr::Column(column)) + .collect::>(); + LogicalPlanBuilder::from(plan).project(exprs)?.build()? 
+ } else { + plan + }; Ok(DataFrame { session_state: self.session_state, plan, diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 78c70606bf68..cf2a157b04b6 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -692,10 +692,6 @@ impl DefaultPhysicalPlanner { physical_input_schema.clone(), )?); - // update group column indices based on partial aggregate plan evaluation - let final_group: Vec> = - initial_aggr.output_group_expr(); - let can_repartition = !groups.is_empty() && session_state.config().target_partitions() > 1 && session_state.config().repartition_aggregations(); @@ -716,13 +712,7 @@ impl DefaultPhysicalPlanner { AggregateMode::Final }; - let final_grouping_set = PhysicalGroupBy::new_single( - final_group - .iter() - .enumerate() - .map(|(i, expr)| (expr.clone(), groups.expr()[i].1.clone())) - .collect(), - ); + let final_grouping_set = initial_aggr.group_expr().as_final(); Arc::new(AggregateExec::try_new( next_partition_mode, @@ -2345,7 +2335,7 @@ mod tests { .expect("hash aggregate"); assert_eq!( "sum(aggregate_test_100.c3)", - final_hash_agg.schema().field(2).name() + final_hash_agg.schema().field(3).name() ); // we need access to the input to the partial aggregate so that other projects can // implement serde diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 19e73140b75c..0292274e57ee 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -21,7 +21,7 @@ use std::cmp::Ordering; use std::collections::{HashMap, HashSet}; use std::fmt::{self, Debug, Display, Formatter}; use std::hash::{Hash, Hasher}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use super::dml::CopyTo; use super::DdlStatement; @@ -2965,6 +2965,15 @@ impl Aggregate { .into_iter() .map(|(q, f)| (q, f.as_ref().clone().with_nullable(true).into())) .collect::>(); + qualified_fields.push(( + None, + Field::new( + Self::INTERNAL_GROUPING_ID, + Self::grouping_id_type(qualified_fields.len()), + false, + ) + .into(), + )); } qualified_fields.extend(exprlist_to_fields(aggr_expr.as_slice(), &input)?); @@ -3016,9 +3025,19 @@ impl Aggregate { }) } + fn is_grouping_set(&self) -> bool { + matches!(self.group_expr.as_slice(), [Expr::GroupingSet(_)]) + } + /// Get the output expressions. fn output_expressions(&self) -> Result> { + static INTERNAL_ID_EXPR: OnceLock = OnceLock::new(); let mut exprs = grouping_set_to_exprlist(self.group_expr.as_slice())?; + if self.is_grouping_set() { + exprs.push(INTERNAL_ID_EXPR.get_or_init(|| { + Expr::Column(Column::from_name(Self::INTERNAL_GROUPING_ID)) + })); + } exprs.extend(self.aggr_expr.iter()); debug_assert!(exprs.len() == self.schema.fields().len()); Ok(exprs) @@ -3030,6 +3049,41 @@ impl Aggregate { pub fn group_expr_len(&self) -> Result { grouping_set_expr_count(&self.group_expr) } + + /// Returns the data type of the grouping id. + /// The grouping ID value is a bitmask where each set bit + /// indicates that the corresponding grouping expression is + /// null + pub fn grouping_id_type(group_exprs: usize) -> DataType { + if group_exprs <= 8 { + DataType::UInt8 + } else if group_exprs <= 16 { + DataType::UInt16 + } else if group_exprs <= 32 { + DataType::UInt32 + } else { + DataType::UInt64 + } + } + + /// Internal column used when the aggregation is a grouping set. + /// + /// This column contains a bitmask where each bit represents a grouping + /// expression. 
The least significant bit corresponds to the rightmost
+    /// grouping expression. A bit value of 0 indicates that the corresponding
+    /// column is included in the grouping set, while a value of 1 means it is excluded.
+    ///
+    /// For example, for the grouping expressions CUBE(a, b), the grouping ID
+    /// column will have the following values:
+    ///     0b00: Both `a` and `b` are included
+    ///     0b01: `b` is excluded
+    ///     0b10: `a` is excluded
+    ///     0b11: Both `a` and `b` are excluded
+    ///
+    /// This internal column is necessary because excluded columns are replaced
+    /// with `NULL` values. To handle these cases correctly, we must distinguish
+    /// between an actual `NULL` value in a column and a column being excluded from the set.
+    pub const INTERNAL_GROUPING_ID: &'static str = "__grouping_id";
 }

 // Manual implementation needed because of `schema` field. Comparison excludes this field.
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index fa927595040e..02b36d0feab9 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -61,7 +61,17 @@ pub fn exprlist_to_columns(expr: &[Expr], accum: &mut HashSet<Column>) -> Result<()>
 /// Count the number of distinct exprs in a list of group by expressions. If the
 /// first element is a `GroupingSet` expression then it must be the only expr.
 pub fn grouping_set_expr_count(group_expr: &[Expr]) -> Result<usize> {
-    grouping_set_to_exprlist(group_expr).map(|exprs| exprs.len())
+    if let Some(Expr::GroupingSet(grouping_set)) = group_expr.first() {
+        if group_expr.len() > 1 {
+            return plan_err!(
+                "Invalid group by expressions, GroupingSet must be the only expression"
+            );
+        }
+        // Grouping sets have an additional internal column for the grouping id
+        Ok(grouping_set.distinct_expr().len() + 1)
+    } else {
+        grouping_set_to_exprlist(group_expr).map(|exprs| exprs.len())
+    }
 }

 /// The [power set] (or powerset) of a set S is the set of all subsets of S, \
diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs
index 1c22c2a4375a..74251e5caad2 100644
--- a/datafusion/optimizer/src/single_distinct_to_groupby.rs
+++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs
@@ -355,7 +355,7 @@ mod tests {
             .build()?;

         // Should not be optimized
-        let expected = "Aggregate: groupBy=[[GROUPING SETS ((test.a), (test.b))]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, count(DISTINCT test.c):Int64]\
+        let expected = "Aggregate: groupBy=[[GROUPING SETS ((test.a), (test.b))]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, count(DISTINCT test.c):Int64]\
         \n  TableScan: test [a:UInt32, b:UInt32, c:UInt32]";

         assert_optimized_plan_equal(plan, expected)
@@ -373,7 +373,7 @@ mod tests {
             .build()?;

         // Should not be optimized
-        let expected = "Aggregate: groupBy=[[CUBE (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, count(DISTINCT test.c):Int64]\
+        let expected = "Aggregate: groupBy=[[CUBE (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, count(DISTINCT test.c):Int64]\
         \n  TableScan: test [a:UInt32, b:UInt32, c:UInt32]";

         assert_optimized_plan_equal(plan, expected)
@@ -392,7 +392,7 @@ mod tests {
             .build()?;

         // Should not be optimized
-        let expected = "Aggregate: groupBy=[[ROLLUP (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, count(DISTINCT test.c):Int64]\
+        let expected = "Aggregate: groupBy=[[ROLLUP (test.a, test.b)]], aggr=[[count(DISTINCT test.c)]] [a:UInt32;N, b:UInt32;N, __grouping_id:UInt8, count(DISTINCT test.c):Int64]\
         \n  TableScan: test [a:UInt32, b:UInt32, c:UInt32]";

         assert_optimized_plan_equal(plan, expected)
diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs
index 9466ff6dd459..f9dd973c814e 100644
--- a/datafusion/physical-plan/src/aggregates/mod.rs
+++ b/datafusion/physical-plan/src/aggregates/mod.rs
@@ -36,10 +36,11 @@ use crate::{
 use arrow::array::ArrayRef;
 use arrow::datatypes::{Field, Schema, SchemaRef};
 use arrow::record_batch::RecordBatch;
+use arrow_array::{UInt16Array, UInt32Array, UInt64Array, UInt8Array};
 use datafusion_common::stats::Precision;
 use datafusion_common::{internal_err, not_impl_err, Result};
 use datafusion_execution::TaskContext;
-use datafusion_expr::Accumulator;
+use datafusion_expr::{Accumulator, Aggregate};
 use datafusion_physical_expr::{
     equivalence::{collapse_lex_req, ProjectionMapping},
     expressions::Column,
@@ -211,13 +212,99 @@ impl PhysicalGroupBy {
             .collect()
     }

+    /// The number of expressions in the output schema.
+    fn num_output_exprs(&self) -> usize {
+        let mut num_exprs = self.expr.len();
+        if !self.is_single() {
+            num_exprs += 1
+        }
+        num_exprs
+    }
+
     /// Return grouping expressions as they occur in the output schema.
     pub fn output_exprs(&self) -> Vec<Arc<dyn PhysicalExpr>> {
-        self.expr
-            .iter()
-            .enumerate()
-            .map(|(index, (_, name))| Arc::new(Column::new(name, index)) as _)
-            .collect()
+        let num_output_exprs = self.num_output_exprs();
+        let mut output_exprs = Vec::with_capacity(num_output_exprs);
+        output_exprs.extend(
+            self.expr
+                .iter()
+                .enumerate()
+                .take(num_output_exprs)
+                .map(|(index, (_, name))| Arc::new(Column::new(name, index)) as _),
+        );
+        if !self.is_single() {
+            output_exprs.push(Arc::new(Column::new(
+                Aggregate::INTERNAL_GROUPING_ID,
+                self.expr.len(),
+            )) as _);
+        }
+        output_exprs
+    }
+
+    /// Returns the number of expressions used as grouping keys.
+    fn num_group_exprs(&self) -> usize {
+        if self.is_single() {
+            self.expr.len()
+        } else {
+            self.expr.len() + 1
+        }
+    }
+
+    /// Returns the fields that are used as the grouping keys.
+    fn group_fields(&self, input_schema: &Schema) -> Result<Vec<Field>> {
+        let mut fields = Vec::with_capacity(self.num_group_exprs());
+        for ((expr, name), group_expr_nullable) in
+            self.expr.iter().zip(self.exprs_nullable().into_iter())
+        {
+            fields.push(
+                Field::new(
+                    name,
+                    expr.data_type(input_schema)?,
+                    group_expr_nullable || expr.nullable(input_schema)?,
+                )
+                .with_metadata(
+                    get_field_metadata(expr, input_schema).unwrap_or_default(),
+                ),
+            );
+        }
+        if !self.is_single() {
+            fields.push(Field::new(
+                Aggregate::INTERNAL_GROUPING_ID,
+                Aggregate::grouping_id_type(self.expr.len()),
+                false,
+            ));
+        }
+        Ok(fields)
+    }
+
+    /// Returns the output fields of the group by.
+    ///
+    /// This may differ from `group_fields`, which can contain internal expressions that
+    /// should not be part of the output schema.
+    fn output_fields(&self, input_schema: &Schema) -> Result<Vec<Field>> {
+        let mut fields = self.group_fields(input_schema)?;
+        fields.truncate(self.num_output_exprs());
+        Ok(fields)
+    }
+
+    /// Returns the `PhysicalGroupBy` for a final aggregation if `self` is used for a partial
+    /// aggregation.
+    pub fn as_final(&self) -> PhysicalGroupBy {
+        let expr: Vec<_> =
+            self.output_exprs()
+                .into_iter()
+                .zip(
+                    self.expr.iter().map(|t| t.1.clone()).chain(std::iter::once(
+                        Aggregate::INTERNAL_GROUPING_ID.to_owned(),
+                    )),
+                )
+                .collect();
+        let num_exprs = expr.len();
+        Self {
+            expr,
+            null_expr: vec![],
+            groups: vec![vec![false; num_exprs]],
+        }
+    }
 }
@@ -321,13 +408,7 @@ impl AggregateExec {
         input: Arc<dyn ExecutionPlan>,
         input_schema: SchemaRef,
     ) -> Result<Self> {
-        let schema = create_schema(
-            &input.schema(),
-            &group_by.expr,
-            &aggr_expr,
-            group_by.exprs_nullable(),
-            mode,
-        )?;
+        let schema = create_schema(&input.schema(), &group_by, &aggr_expr, mode)?;
         let schema = Arc::new(schema);
         AggregateExec::try_new_with_schema(
@@ -789,25 +870,12 @@ impl ExecutionPlan for AggregateExec {
 fn create_schema(
     input_schema: &Schema,
-    group_expr: &[(Arc<dyn PhysicalExpr>, String)],
+    group_by: &PhysicalGroupBy,
     aggr_expr: &[AggregateFunctionExpr],
-    group_expr_nullable: Vec<bool>,
     mode: AggregateMode,
 ) -> Result<Schema> {
-    let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len());
-    for (index, (expr, name)) in group_expr.iter().enumerate() {
-        fields.push(
-            Field::new(
-                name,
-                expr.data_type(input_schema)?,
-                // In cases where we have multiple grouping sets, we will use NULL expressions in
-                // order to align the grouping sets. So the field must be nullable even if the underlying
-                // schema field is not.
-                group_expr_nullable[index] || expr.nullable(input_schema)?,
-            )
-            .with_metadata(get_field_metadata(expr, input_schema).unwrap_or_default()),
-        )
-    }
+    let mut fields = Vec::with_capacity(group_by.num_output_exprs() + aggr_expr.len());
+    fields.extend(group_by.output_fields(input_schema)?);

     match mode {
         AggregateMode::Partial => {
@@ -833,9 +901,8 @@ fn create_schema(
     ))
 }

-fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef {
-    let group_fields = schema.fields()[0..group_count].to_vec();
-    Arc::new(Schema::new(group_fields))
+fn group_schema(input_schema: &Schema, group_by: &PhysicalGroupBy) -> Result<SchemaRef> {
+    Ok(Arc::new(Schema::new(group_by.group_fields(input_schema)?)))
 }

 /// Determines the lexical ordering requirement for an aggregate expression.
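The grouping-id encoding that `as_final` and the schema helpers above carry through can be checked by hand; a small sketch mirroring `grouping_id_type` and the bitmask fold used by the operator (values illustrative):

```rust
// For each grouping-set row pattern, a set bit means the corresponding
// expression is excluded (NULLed out) in that grouping set; the leftmost
// expression maps to the most significant of the used bits.
fn group_id(null_mask: &[bool]) -> u64 {
    null_mask
        .iter()
        .fold(0u64, |acc, &is_null| (acc << 1) | u64::from(is_null))
}

// Width selection mirrors Aggregate::grouping_id_type: the smallest
// unsigned integer type that holds one bit per grouping expression.
fn grouping_id_bits(group_exprs: usize) -> usize {
    match group_exprs {
        0..=8 => 8,
        9..=16 => 16,
        17..=32 => 32,
        _ => 64,
    }
}

fn main() {
    // CUBE(a, b) produces four grouping sets.
    assert_eq!(group_id(&[false, false]), 0b00); // group by (a, b)
    assert_eq!(group_id(&[false, true]), 0b01); // b excluded: group by (a)
    assert_eq!(group_id(&[true, false]), 0b10); // a excluded: group by (b)
    assert_eq!(group_id(&[true, true]), 0b11); // grand total ()
    assert_eq!(grouping_id_bits(2), 8); // stored as UInt8
}
```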
@@ -1142,6 +1209,27 @@ fn evaluate_optional( .collect() } +fn group_id_array(group: &[bool], batch: &RecordBatch) -> Result { + if group.len() > 64 { + return not_impl_err!( + "Grouping sets with more than 64 columns are not supported" + ); + } + let group_id = group.iter().fold(0u64, |acc, &is_null| { + (acc << 1) | if is_null { 1 } else { 0 } + }); + let num_rows = batch.num_rows(); + if group.len() <= 8 { + Ok(Arc::new(UInt8Array::from(vec![group_id as u8; num_rows]))) + } else if group.len() <= 16 { + Ok(Arc::new(UInt16Array::from(vec![group_id as u16; num_rows]))) + } else if group.len() <= 32 { + Ok(Arc::new(UInt32Array::from(vec![group_id as u32; num_rows]))) + } else { + Ok(Arc::new(UInt64Array::from(vec![group_id; num_rows]))) + } +} + /// Evaluate a group by expression against a `RecordBatch` /// /// Arguments: @@ -1174,23 +1262,24 @@ pub(crate) fn evaluate_group_by( }) .collect::>>()?; - Ok(group_by + group_by .groups .iter() .map(|group| { - group - .iter() - .enumerate() - .map(|(idx, is_null)| { - if *is_null { - Arc::clone(&null_exprs[idx]) - } else { - Arc::clone(&exprs[idx]) - } - }) - .collect() + let mut group_values = Vec::with_capacity(group_by.num_group_exprs()); + group_values.extend(group.iter().enumerate().map(|(idx, is_null)| { + if *is_null { + Arc::clone(&null_exprs[idx]) + } else { + Arc::clone(&exprs[idx]) + } + })); + if !group_by.is_single() { + group_values.push(group_id_array(group, batch)?); + } + Ok(group_values) }) - .collect()) + .collect() } #[cfg(test)] @@ -1348,21 +1437,21 @@ mod tests { ) -> Result<()> { let input_schema = input.schema(); - let grouping_set = PhysicalGroupBy { - expr: vec![ + let grouping_set = PhysicalGroupBy::new( + vec![ (col("a", &input_schema)?, "a".to_string()), (col("b", &input_schema)?, "b".to_string()), ], - null_expr: vec![ + vec![ (lit(ScalarValue::UInt32(None)), "a".to_string()), (lit(ScalarValue::Float64(None)), "b".to_string()), ], - groups: vec![ + vec![ vec![false, true], // (a, NULL) vec![true, false], // (NULL, b) vec![false, false], // (a,b) ], - }; + ); let aggregates = vec![AggregateExprBuilder::new(count_udaf(), vec![lit(1i8)]) .schema(Arc::clone(&input_schema)) @@ -1392,63 +1481,56 @@ mod tests { // In spill mode, we test with the limited memory, if the mem usage exceeds, // we trigger the early emit rule, which turns out the partial aggregate result. 
vec![ - "+---+-----+-----------------+", - "| a | b | COUNT(1)[count] |", - "+---+-----+-----------------+", - "| | 1.0 | 1 |", - "| | 1.0 | 1 |", - "| | 2.0 | 1 |", - "| | 2.0 | 1 |", - "| | 3.0 | 1 |", - "| | 3.0 | 1 |", - "| | 4.0 | 1 |", - "| | 4.0 | 1 |", - "| 2 | | 1 |", - "| 2 | | 1 |", - "| 2 | 1.0 | 1 |", - "| 2 | 1.0 | 1 |", - "| 3 | | 1 |", - "| 3 | | 2 |", - "| 3 | 2.0 | 2 |", - "| 3 | 3.0 | 1 |", - "| 4 | | 1 |", - "| 4 | | 2 |", - "| 4 | 3.0 | 1 |", - "| 4 | 4.0 | 2 |", - "+---+-----+-----------------+", + "+---+-----+---------------+-----------------+", + "| a | b | __grouping_id | COUNT(1)[count] |", + "+---+-----+---------------+-----------------+", + "| | 1.0 | 2 | 1 |", + "| | 1.0 | 2 | 1 |", + "| | 2.0 | 2 | 1 |", + "| | 2.0 | 2 | 1 |", + "| | 3.0 | 2 | 1 |", + "| | 3.0 | 2 | 1 |", + "| | 4.0 | 2 | 1 |", + "| | 4.0 | 2 | 1 |", + "| 2 | | 1 | 1 |", + "| 2 | | 1 | 1 |", + "| 2 | 1.0 | 0 | 1 |", + "| 2 | 1.0 | 0 | 1 |", + "| 3 | | 1 | 1 |", + "| 3 | | 1 | 2 |", + "| 3 | 2.0 | 0 | 2 |", + "| 3 | 3.0 | 0 | 1 |", + "| 4 | | 1 | 1 |", + "| 4 | | 1 | 2 |", + "| 4 | 3.0 | 0 | 1 |", + "| 4 | 4.0 | 0 | 2 |", + "+---+-----+---------------+-----------------+", ] } else { vec![ - "+---+-----+-----------------+", - "| a | b | COUNT(1)[count] |", - "+---+-----+-----------------+", - "| | 1.0 | 2 |", - "| | 2.0 | 2 |", - "| | 3.0 | 2 |", - "| | 4.0 | 2 |", - "| 2 | | 2 |", - "| 2 | 1.0 | 2 |", - "| 3 | | 3 |", - "| 3 | 2.0 | 2 |", - "| 3 | 3.0 | 1 |", - "| 4 | | 3 |", - "| 4 | 3.0 | 1 |", - "| 4 | 4.0 | 2 |", - "+---+-----+-----------------+", + "+---+-----+---------------+-----------------+", + "| a | b | __grouping_id | COUNT(1)[count] |", + "+---+-----+---------------+-----------------+", + "| | 1.0 | 2 | 2 |", + "| | 2.0 | 2 | 2 |", + "| | 3.0 | 2 | 2 |", + "| | 4.0 | 2 | 2 |", + "| 2 | | 1 | 2 |", + "| 2 | 1.0 | 0 | 2 |", + "| 3 | | 1 | 3 |", + "| 3 | 2.0 | 0 | 2 |", + "| 3 | 3.0 | 0 | 1 |", + "| 4 | | 1 | 3 |", + "| 4 | 3.0 | 0 | 1 |", + "| 4 | 4.0 | 0 | 2 |", + "+---+-----+---------------+-----------------+", ] }; assert_batches_sorted_eq!(expected, &result); - let groups = partial_aggregate.group_expr().expr().to_vec(); - let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); - let final_group: Vec<(Arc, String)> = groups - .iter() - .map(|(_expr, name)| Ok((col(name, &input_schema)?, name.clone()))) - .collect::>()?; - - let final_grouping_set = PhysicalGroupBy::new_single(final_group); + let final_grouping_set = grouping_set.as_final(); let task_ctx = if spill { new_spill_ctx(4, 3160) @@ -1468,26 +1550,26 @@ mod tests { let result = common::collect(merged_aggregate.execute(0, Arc::clone(&task_ctx))?).await?; let batch = concat_batches(&result[0].schema(), &result)?; - assert_eq!(batch.num_columns(), 3); + assert_eq!(batch.num_columns(), 4); assert_eq!(batch.num_rows(), 12); let expected = vec![ - "+---+-----+----------+", - "| a | b | COUNT(1) |", - "+---+-----+----------+", - "| | 1.0 | 2 |", - "| | 2.0 | 2 |", - "| | 3.0 | 2 |", - "| | 4.0 | 2 |", - "| 2 | | 2 |", - "| 2 | 1.0 | 2 |", - "| 3 | | 3 |", - "| 3 | 2.0 | 2 |", - "| 3 | 3.0 | 1 |", - "| 4 | | 3 |", - "| 4 | 3.0 | 1 |", - "| 4 | 4.0 | 2 |", - "+---+-----+----------+", + "+---+-----+---------------+----------+", + "| a | b | __grouping_id | COUNT(1) |", + "+---+-----+---------------+----------+", + "| | 1.0 | 2 | 2 |", + "| | 2.0 | 2 | 2 |", + "| | 3.0 | 2 | 2 |", + "| | 4.0 | 2 | 2 |", + "| 2 | | 1 | 2 |", + "| 2 | 1.0 | 0 | 2 |", + "| 3 | | 1 | 3 |", + "| 3 | 2.0 | 0 | 2 |", + "| 3 | 3.0 | 0 | 1 |", 
+ "| 4 | | 1 | 3 |", + "| 4 | 3.0 | 0 | 1 |", + "| 4 | 4.0 | 0 | 2 |", + "+---+-----+---------------+----------+", ]; assert_batches_sorted_eq!(&expected, &result); @@ -1503,11 +1585,11 @@ mod tests { async fn check_aggregates(input: Arc, spill: bool) -> Result<()> { let input_schema = input.schema(); - let grouping_set = PhysicalGroupBy { - expr: vec![(col("a", &input_schema)?, "a".to_string())], - null_expr: vec![], - groups: vec![vec![false]], - }; + let grouping_set = PhysicalGroupBy::new( + vec![(col("a", &input_schema)?, "a".to_string())], + vec![], + vec![vec![false]], + ); let aggregates: Vec = vec![ @@ -1563,13 +1645,7 @@ mod tests { let merge = Arc::new(CoalescePartitionsExec::new(partial_aggregate)); - let final_group: Vec<(Arc, String)> = grouping_set - .expr - .iter() - .map(|(_expr, name)| Ok((col(name, &input_schema)?, name.clone()))) - .collect::>()?; - - let final_grouping_set = PhysicalGroupBy::new_single(final_group); + let final_grouping_set = grouping_set.as_final(); let merged_aggregate = Arc::new(AggregateExec::try_new( AggregateMode::Final, @@ -1825,11 +1901,11 @@ mod tests { let task_ctx = Arc::new(task_ctx); let groups_none = PhysicalGroupBy::default(); - let groups_some = PhysicalGroupBy { - expr: vec![(col("a", &input_schema)?, "a".to_string())], - null_expr: vec![], - groups: vec![vec![false]], - }; + let groups_some = PhysicalGroupBy::new( + vec![(col("a", &input_schema)?, "a".to_string())], + vec![], + vec![vec![false]], + ); // something that allocates within the aggregator let aggregates_v0: Vec = @@ -2306,7 +2382,7 @@ mod tests { )?); let aggregate_exec = Arc::new(AggregateExec::try_new( - AggregateMode::Partial, + AggregateMode::Single, groups, aggregates.clone(), vec![None], @@ -2318,13 +2394,13 @@ mod tests { collect(aggregate_exec.execute(0, Arc::new(TaskContext::default()))?).await?; let expected = [ - "+-----+-----+-------+----------+", - "| a | b | const | 1[count] |", - "+-----+-----+-------+----------+", - "| | 0.0 | | 32768 |", - "| 0.0 | | | 32768 |", - "| | | 1 | 32768 |", - "+-----+-----+-------+----------+", + "+-----+-----+-------+---------------+-------+", + "| a | b | const | __grouping_id | 1 |", + "+-----+-----+-------+---------------+-------+", + "| | | 1 | 6 | 32768 |", + "| | 0.0 | | 5 | 32768 |", + "| 0.0 | | | 3 | 32768 |", + "+-----+-----+-------+---------------+-------+", ]; assert_batches_sorted_eq!(expected, &output); @@ -2638,30 +2714,30 @@ mod tests { .build()?, ]; - let grouping_set = PhysicalGroupBy { - expr: vec![ + let grouping_set = PhysicalGroupBy::new( + vec![ (col("a", &input_schema)?, "a".to_string()), (col("b", &input_schema)?, "b".to_string()), ], - null_expr: vec![ + vec![ (lit(ScalarValue::Float32(None)), "a".to_string()), (lit(ScalarValue::Float32(None)), "b".to_string()), ], - groups: vec![ + vec![ vec![false, true], // (a, NULL) vec![false, false], // (a,b) ], - }; + ); let aggr_schema = create_schema( &input_schema, - &grouping_set.expr, + &grouping_set, &aggr_expr, - grouping_set.exprs_nullable(), AggregateMode::Final, )?; let expected_schema = Schema::new(vec![ Field::new("a", DataType::Float32, false), Field::new("b", DataType::Float32, true), + Field::new("__grouping_id", DataType::UInt8, false), Field::new("COUNT(a)", DataType::Int64, false), ]); assert_eq!(aggr_schema, expected_schema); diff --git a/datafusion/physical-plan/src/aggregates/row_hash.rs b/datafusion/physical-plan/src/aggregates/row_hash.rs index 9e4968f1123e..5121e6cc3b35 100644 --- a/datafusion/physical-plan/src/aggregates/row_hash.rs +++ 
b/datafusion/physical-plan/src/aggregates/row_hash.rs
@@ -449,13 +449,13 @@ impl GroupedHashAggregateStream {
         let aggregate_arguments = aggregates::aggregate_expressions(
             &agg.aggr_expr,
             &agg.mode,
-            agg_group_by.expr.len(),
+            agg_group_by.num_group_exprs(),
         )?;
         // arguments for aggregating spilled data is the same as the one for final aggregation
         let merging_aggregate_arguments = aggregates::aggregate_expressions(
             &agg.aggr_expr,
             &AggregateMode::Final,
-            agg_group_by.expr.len(),
+            agg_group_by.num_group_exprs(),
         )?;

         let filter_expressions = match agg.mode {
@@ -473,7 +473,7 @@ impl GroupedHashAggregateStream {
             .map(create_group_accumulator)
             .collect::<Result<Vec<_>>>()?;

-        let group_schema = group_schema(&agg_schema, agg_group_by.expr.len());
+        let group_schema = group_schema(&agg.input().schema(), &agg_group_by)?;
         let spill_expr = group_schema
             .fields
             .into_iter()
diff --git a/datafusion/sql/src/unparser/utils.rs b/datafusion/sql/src/unparser/utils.rs
index 8b2530a7499b..e05df8ba77fc 100644
--- a/datafusion/sql/src/unparser/utils.rs
+++ b/datafusion/sql/src/unparser/utils.rs
@@ -15,6 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.

+use std::cmp::Ordering;
+
 use datafusion_common::{
     internal_err,
     tree_node::{Transformed, TreeNode},
@@ -169,10 +171,17 @@ fn find_agg_expr<'a>(agg: &'a Aggregate, column: &Column) -> Result<Option<&'a Expr>> {
             let grouping_expr = grouping_set_to_exprlist(agg.group_expr.as_slice())?;
-            Ok(grouping_expr.into_iter().nth(index))
+            match index.cmp(&grouping_expr.len()) {
+                Ordering::Less => Ok(grouping_expr.into_iter().nth(index)),
+                Ordering::Equal => {
+                    internal_err!(
+                        "Tried to unproject column referring to internal grouping id"
+                    )
+                }
+                Ordering::Greater => {
+                    Ok(agg.aggr_expr.get(index - grouping_expr.len() - 1))
+                }
+            }
     } else {
         Ok(agg.group_expr.iter().chain(agg.aggr_expr.iter()).nth(index))
     }
diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt
index a78ade81eeba..250fa85cddef 100644
--- a/datafusion/sqllogictest/test_files/aggregate.slt
+++ b/datafusion/sqllogictest/test_files/aggregate.slt
@@ -3520,6 +3520,18 @@ SELECT MIN(value), MAX(value) FROM integers_with_nulls
 ----
 1 5

+# grouping_sets with null values
+query II rowsort
+SELECT value, min(value) FROM integers_with_nulls GROUP BY CUBE(value)
+----
+1 1
+3 3
+4 4
+5 5
+NULL 1
+NULL NULL
+
+
 statement ok
 DROP TABLE integers_with_nulls;

@@ -4879,16 +4891,18 @@ query TT
 EXPLAIN SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3;
 ----
 logical_plan
-01)Limit: skip=0, fetch=3
-02)--Aggregate: groupBy=[[ROLLUP (aggregate_test_100.c2, aggregate_test_100.c3)]], aggr=[[]]
-03)----TableScan: aggregate_test_100 projection=[c2, c3]
+01)Projection: aggregate_test_100.c2, aggregate_test_100.c3
+02)--Limit: skip=0, fetch=3
+03)----Aggregate: groupBy=[[ROLLUP (aggregate_test_100.c2, aggregate_test_100.c3)]], aggr=[[]]
+04)------TableScan: aggregate_test_100 projection=[c2, c3]
 physical_plan
-01)GlobalLimitExec: skip=0, fetch=3
-02)--AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3], aggr=[], lim=[3]
-03)----CoalescePartitionsExec
-04)------AggregateExec: mode=Partial, gby=[(NULL as c2, NULL as c3), (c2@0 as c2, NULL as c3), (c2@0 as c2, c3@1 as c3)], aggr=[]
-05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
-06)----------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true
+01)ProjectionExec: expr=[c2@0 as c2, c3@1 as c3]
+02)--GlobalLimitExec: skip=0, fetch=3
+03)----AggregateExec: mode=Final, gby=[c2@0 as c2, c3@1 as c3, __grouping_id@2 as __grouping_id], aggr=[], lim=[3]
+04)------CoalescePartitionsExec +05)--------AggregateExec: mode=Partial, gby=[(NULL as c2, NULL as c3), (c2@0 as c2, NULL as c3), (c2@0 as c2, c3@1 as c3)], aggr=[] +06)----------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +07)------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c2, c3], has_header=true query II SELECT c2, c3 FROM aggregate_test_100 group by rollup(c2, c3) limit 3; diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index f561fa9e9ac8..a80a0891e977 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -5152,8 +5152,6 @@ drop table test_case_expr statement ok drop table t; -# TODO: Current grouping set result is not align with Postgres and DuckDB, we might want to change the result -# See https://github.com/apache/datafusion/issues/12570 # test multi group by for binary type with nulls statement ok create table t(a int, b bytea) as values (1, 0xa), (1, 0xa), (2, null), (null, 0xb), (null, 0xb); @@ -5162,11 +5160,14 @@ query I?I select a, b, count(*) from t group by grouping sets ((a, b), (a), (b)); ---- 1 0a 2 -2 NULL 2 -NULL 0b 4 +2 NULL 1 +NULL 0b 2 1 NULL 2 -NULL NULL 3 +2 NULL 1 +NULL NULL 2 NULL 0a 2 +NULL NULL 1 +NULL 0b 2 statement ok drop table t; diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 3b7d0fd29610..ce6d1825cd25 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -294,8 +294,9 @@ async fn aggregate_grouping_sets() -> Result<()> { async fn aggregate_grouping_rollup() -> Result<()> { assert_expected_plan( "SELECT a, c, e, avg(b) FROM data GROUP BY ROLLUP (a, c, e)", - "Aggregate: groupBy=[[GROUPING SETS ((data.a, data.c, data.e), (data.a, data.c), (data.a), ())]], aggr=[[avg(data.b)]]\ - \n TableScan: data projection=[a, b, c, e]", + "Projection: data.a, data.c, data.e, avg(data.b)\ + \n Aggregate: groupBy=[[GROUPING SETS ((data.a, data.c, data.e), (data.a, data.c), (data.a), ())]], aggr=[[avg(data.b)]]\ + \n TableScan: data projection=[a, b, c, e]", true ).await } From 5360d206f39253f80179214e67dbac3f339b26ce Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Mon, 7 Oct 2024 13:28:08 -0400 Subject: [PATCH 21/22] Migrate documentation for all string functions from scalar_functions.md to code (#12775) * Added documentation for string and unicode functions. * Fixed issues with aliases. * Cargo fmt. * Minor doc fixes. 
* Update docs for var_pop/samp --------- Co-authored-by: Andrew Lamb --- .../core/src/bin/print_functions_docs.rs | 9 +- datafusion/functions/src/string/ascii.rs | 51 +- datafusion/functions/src/string/bit_length.rs | 40 +- datafusion/functions/src/string/btrim.rs | 41 +- datafusion/functions/src/string/chr.rs | 39 +- datafusion/functions/src/string/concat.rs | 40 +- datafusion/functions/src/string/concat_ws.rs | 44 +- datafusion/functions/src/string/contains.rs | 36 +- datafusion/functions/src/string/ends_with.rs | 43 +- datafusion/functions/src/string/initcap.rs | 36 +- .../functions/src/string/levenshtein.rs | 32 +- datafusion/functions/src/string/lower.rs | 41 +- datafusion/functions/src/string/ltrim.rs | 47 +- .../functions/src/string/octet_length.rs | 40 +- datafusion/functions/src/string/overlay.rs | 37 +- datafusion/functions/src/string/repeat.rs | 41 +- datafusion/functions/src/string/replace.rs | 36 +- datafusion/functions/src/string/rtrim.rs | 47 +- datafusion/functions/src/string/split_part.rs | 36 +- .../functions/src/string/starts_with.rs | 37 +- datafusion/functions/src/string/to_hex.rs | 36 +- datafusion/functions/src/string/upper.rs | 34 +- datafusion/functions/src/string/uuid.rs | 30 +- .../functions/src/unicode/character_length.rs | 37 +- .../functions/src/unicode/find_in_set.rs | 37 +- datafusion/functions/src/unicode/left.rs | 38 +- datafusion/functions/src/unicode/lpad.rs | 41 +- datafusion/functions/src/unicode/reverse.rs | 38 +- datafusion/functions/src/unicode/right.rs | 35 +- datafusion/functions/src/unicode/rpad.rs | 50 +- datafusion/functions/src/unicode/strpos.rs | 34 +- datafusion/functions/src/unicode/substr.rs | 35 +- .../functions/src/unicode/substrindex.rs | 46 +- datafusion/functions/src/unicode/translate.rs | 35 +- .../user-guide/sql/aggregate_functions_new.md | 51 +- .../source/user-guide/sql/scalar_functions.md | 626 +----------- .../user-guide/sql/scalar_functions_new.md | 927 +++++++++++++++++- 37 files changed, 2076 insertions(+), 827 deletions(-) diff --git a/datafusion/core/src/bin/print_functions_docs.rs b/datafusion/core/src/bin/print_functions_docs.rs index 92737b244a64..53cfe94ecab3 100644 --- a/datafusion/core/src/bin/print_functions_docs.rs +++ b/datafusion/core/src/bin/print_functions_docs.rs @@ -130,13 +130,14 @@ fn print_docs( .find(|f| f.get_name() == name || f.get_aliases().contains(&name)) .unwrap(); - let name = f.get_name(); let aliases = f.get_aliases(); let documentation = f.get_documentation(); // if this name is an alias we need to display what it's an alias of if aliases.contains(&name) { - let _ = write!(docs, "_Alias of [{name}](#{name})._"); + let fname = f.get_name(); + let _ = writeln!(docs, r#"### `{name}`"#); + let _ = writeln!(docs, "_Alias of [{fname}](#{fname})._"); continue; } @@ -183,10 +184,10 @@ fn print_docs( // next, aliases if !f.get_aliases().is_empty() { - let _ = write!(docs, "#### Aliases"); + let _ = writeln!(docs, "#### Aliases"); for alias in f.get_aliases() { - let _ = writeln!(docs, "- {alias}"); + let _ = writeln!(docs, "- {}", alias.replace("_", r#"\_"#)); } } diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs index d01c6631e9dd..1e828d066786 100644 --- a/datafusion/functions/src/string/ascii.rs +++ b/datafusion/functions/src/string/ascii.rs @@ -26,24 +26,6 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::{Arc, OnceLock}; -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_ascii_doc() -> &'static 
Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder() - .with_doc_section(DOC_SECTION_STRING) - .with_description("Returns the ASCII value of the first character in a string.") - .with_syntax_example("ascii(str)") - .with_argument( - "str", - "String expression to operate on. Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View.", - ) - .with_related_udf("chr") - .build() - .unwrap() - }) -} - #[derive(Debug)] pub struct AsciiFunc { signature: Signature, @@ -96,6 +78,39 @@ impl ScalarUDFImpl for AsciiFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_ascii_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description( + "Returns the Unicode character code of the first character in a string.", + ) + .with_syntax_example("ascii(str)") + .with_sql_example( + r#"```sql +> select ascii('abc'); ++--------------------+ +| ascii(Utf8("abc")) | ++--------------------+ +| 97 | ++--------------------+ +> select ascii('🚀'); ++-------------------+ +| ascii(Utf8("🚀")) | ++-------------------+ +| 128640 | ++-------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("chr") + .build() + .unwrap() + }) +} + fn calculate_ascii<'a, V>(array: V) -> Result where V: ArrayAccessor, diff --git a/datafusion/functions/src/string/bit_length.rs b/datafusion/functions/src/string/bit_length.rs index 65ec1a4a7734..bd22c1504baf 100644 --- a/datafusion/functions/src/string/bit_length.rs +++ b/datafusion/functions/src/string/bit_length.rs @@ -15,17 +15,17 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use arrow::compute::kernels::length::bit_length; use arrow::datatypes::DataType; +use std::any::Any; +use std::sync::OnceLock; +use crate::utils::utf8_to_int_type; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::utf8_to_int_type; - #[derive(Debug)] pub struct BitLengthFunc { signature: Signature, @@ -88,4 +88,34 @@ impl ScalarUDFImpl for BitLengthFunc { }, } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_bit_length_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_bit_length_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the bit length of a string.") + .with_syntax_example("bit_length(str)") + .with_sql_example( + r#"```sql +> select bit_length('datafusion'); ++--------------------------------+ +| bit_length(Utf8("datafusion")) | ++--------------------------------+ +| 80 | ++--------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("length") + .with_related_udf("octet_length") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs index 0e992ff27fd3..b2e79a7b8930 100644 --- a/datafusion/functions/src/string/btrim.rs +++ b/datafusion/functions/src/string/btrim.rs @@ -15,18 +15,18 @@ // specific language governing permissions and limitations // under the License. 
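Every function in this patch registers its documentation the same way; a schematic of the builder shape used above (the function name and strings are placeholders, not a real UDF):

```rust
use std::sync::OnceLock;

use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
use datafusion_expr::Documentation;

static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();

// Lazily build the docs once; subsequent calls return the cached value.
fn get_example_doc() -> &'static Documentation {
    DOCUMENTATION.get_or_init(|| {
        Documentation::builder()
            .with_doc_section(DOC_SECTION_STRING)
            .with_description("One-line description shown in the generated docs.")
            .with_syntax_example("example_fn(str)")
            .with_sql_example(
                r#"```sql
> select example_fn('abc');
```"#,
            )
            .with_standard_argument("str", "String")
            .with_related_udf("another_fn")
            .build()
            .unwrap()
    })
}

fn main() {
    let _doc = get_example_doc();
}
```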
+use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; -use std::any::Any; - use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; - -use crate::string::common::*; -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use std::any::Any; +use std::sync::OnceLock; /// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed. /// btrim('xyxtrimyyx', 'xyz') = 'trim' @@ -109,6 +109,35 @@ impl ScalarUDFImpl for BTrimFunc { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_btrim_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_btrim_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Trims the specified trim string from the start and end of a string. If no trim string is provided, all whitespace is removed from the start and end of the input string.") + .with_syntax_example("btrim(str[, trim_str])") + .with_sql_example(r#"```sql +> select btrim('__datafusion____', '_'); ++-------------------------------------------+ +| btrim(Utf8("__datafusion____"),Utf8("_")) | ++-------------------------------------------+ +| datafusion | ++-------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("trim_str", "String expression to operate on. Can be a constant, column, or function, and any combination of operators. _Default is whitespace characters._") + .with_related_udf("ltrim") + .with_related_udf("rtrim") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/chr.rs b/datafusion/functions/src/string/chr.rs index 4da7dc01594d..ae0900af37d3 100644 --- a/datafusion/functions/src/string/chr.rs +++ b/datafusion/functions/src/string/chr.rs @@ -16,7 +16,7 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::ArrayRef; use arrow::array::StringArray; @@ -24,13 +24,13 @@ use arrow::datatypes::DataType; use arrow::datatypes::DataType::Int64; use arrow::datatypes::DataType::Utf8; +use crate::utils::make_scalar_function; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::make_scalar_function; - /// Returns the character with the given code. chr(0) is disallowed because text data types cannot store that character. 
/// chr(65) = 'A' pub fn chr(args: &[ArrayRef]) -> Result { @@ -99,4 +99,35 @@ impl ScalarUDFImpl for ChrFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(chr, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_chr_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_chr_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description( + "Returns the character with the specified ASCII or Unicode code value.", + ) + .with_syntax_example("chr(expression)") + .with_sql_example( + r#"```sql +> select chr(128640); ++--------------------+ +| chr(Int64(128640)) | ++--------------------+ +| 🚀 | ++--------------------+ +```"#, + ) + .with_standard_argument("expression", "String") + .with_related_udf("ascii") + .build() + .unwrap() + }) } diff --git a/datafusion/functions/src/string/concat.rs b/datafusion/functions/src/string/concat.rs index 98f57efef90d..228fcd460c97 100644 --- a/datafusion/functions/src/string/concat.rs +++ b/datafusion/functions/src/string/concat.rs @@ -18,18 +18,18 @@ use arrow::array::{as_largestring_array, Array}; use arrow::datatypes::DataType; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; +use crate::string::common::*; +use crate::string::concat; use datafusion_common::cast::{as_string_array, as_string_view_array}; use datafusion_common::{internal_err, plan_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; -use datafusion_expr::{lit, ColumnarValue, Expr, Volatility}; +use datafusion_expr::{lit, ColumnarValue, Documentation, Expr, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; -use crate::string::concat; - #[derive(Debug)] pub struct ConcatFunc { signature: Signature, @@ -244,6 +244,36 @@ impl ScalarUDFImpl for ConcatFunc { ) -> Result { simplify_concat(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_concat_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_concat_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Concatenates multiple strings together.") + .with_syntax_example("concat(str[, ..., str_n])") + .with_sql_example( + r#"```sql +> select concat('data', 'f', 'us', 'ion'); ++-------------------------------------------------------+ +| concat(Utf8("data"),Utf8("f"),Utf8("us"),Utf8("ion")) | ++-------------------------------------------------------+ +| datafusion | ++-------------------------------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_argument("str_n", "Subsequent string expressions to concatenate.") + .with_related_udf("concat_ws") + .build() + .unwrap() + }) } pub fn simplify_concat(args: Vec) -> Result { diff --git a/datafusion/functions/src/string/concat_ws.rs b/datafusion/functions/src/string/concat_ws.rs index 1134c525cfca..a20cbf1a16f5 100644 --- a/datafusion/functions/src/string/concat_ws.rs +++ b/datafusion/functions/src/string/concat_ws.rs @@ -17,7 +17,7 @@ use arrow::array::{as_largestring_array, Array, StringArray}; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::datatypes::DataType; @@ -27,8 +27,9 @@ use 
crate::string::concat_ws; use datafusion_common::cast::{as_string_array, as_string_view_array}; use datafusion_common::{exec_err, internal_err, plan_err, Result, ScalarValue}; use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; -use datafusion_expr::{lit, ColumnarValue, Expr, Volatility}; +use datafusion_expr::{lit, ColumnarValue, Documentation, Expr, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; #[derive(Debug)] @@ -264,6 +265,45 @@ impl ScalarUDFImpl for ConcatWsFunc { _ => Ok(ExprSimplifyResult::Original(args)), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_concat_ws_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_concat_ws_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description( + "Concatenates multiple strings together with a specified separator.", + ) + .with_syntax_example("concat_ws(separator, str[, ..., str_n])") + .with_sql_example( + r#"```sql +> select concat_ws('_', 'data', 'fusion'); ++--------------------------------------------------+ +| concat_ws(Utf8("_"),Utf8("data"),Utf8("fusion")) | ++--------------------------------------------------+ +| data_fusion | ++--------------------------------------------------+ +```"#, + ) + .with_argument( + "separator", + "Separator to insert between concatenated strings.", + ) + .with_standard_argument("str", "String") + .with_standard_argument( + "str_n", + "Subsequent string expressions to concatenate.", + ) + .with_related_udf("concat") + .build() + .unwrap() + }) } fn simplify_concat_ws(delimiter: &Expr, args: &[Expr]) -> Result { diff --git a/datafusion/functions/src/string/contains.rs b/datafusion/functions/src/string/contains.rs index 722451ab5344..7fc1fa876c11 100644 --- a/datafusion/functions/src/string/contains.rs +++ b/datafusion/functions/src/string/contains.rs @@ -23,13 +23,14 @@ use arrow::datatypes::DataType::{Boolean, LargeUtf8, Utf8, Utf8View}; use datafusion_common::exec_err; use datafusion_common::DataFusionError; use datafusion_common::Result; -use datafusion_expr::ScalarUDFImpl; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, Signature, Volatility}; +use datafusion_expr::{Documentation, ScalarUDFImpl}; use arrow::compute::regexp_is_match; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; #[derive(Debug)] pub struct ContainsFunc { @@ -84,6 +85,37 @@ impl ScalarUDFImpl for ContainsFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(contains, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_contains_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_contains_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description( + "Return true if search_str is found within string (case-sensitive).", + ) + .with_syntax_example("contains(str, search_str)") + .with_sql_example( + r#"```sql +> select contains('the quick brown fox', 'row'); ++---------------------------------------------------+ +| contains(Utf8("the quick brown fox"),Utf8("row")) | ++---------------------------------------------------+ +| true | 
++---------------------------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_argument("search_str", "The string to search for in str.") + .build() + .unwrap() + }) } /// use regexp_is_match_utf8_scalar to do the calculation for contains diff --git a/datafusion/functions/src/string/ends_with.rs b/datafusion/functions/src/string/ends_with.rs index 03a1795954d0..786010764cc3 100644 --- a/datafusion/functions/src/string/ends_with.rs +++ b/datafusion/functions/src/string/ends_with.rs @@ -16,18 +16,18 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::ArrayRef; use arrow::datatypes::DataType; +use crate::utils::make_scalar_function; use datafusion_common::{internal_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::make_scalar_function; - #[derive(Debug)] pub struct EndsWithFunc { signature: Signature, @@ -84,6 +84,41 @@ impl ScalarUDFImpl for EndsWithFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_ends_with_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_ends_with_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Tests if a string ends with a substring.") + .with_syntax_example("ends_with(str, substr)") + .with_sql_example( + r#"```sql +> select ends_with('datafusion', 'soin'); ++--------------------------------------------+ +| ends_with(Utf8("datafusion"),Utf8("soin")) | ++--------------------------------------------+ +| false | ++--------------------------------------------+ +> select ends_with('datafusion', 'sion'); ++--------------------------------------------+ +| ends_with(Utf8("datafusion"),Utf8("sion")) | ++--------------------------------------------+ +| true | ++--------------------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_argument("substr", "Substring to test for.") + .build() + .unwrap() + }) } /// Returns true if string ends with suffix. diff --git a/datafusion/functions/src/string/initcap.rs b/datafusion/functions/src/string/initcap.rs index 4e1eb213ef57..ffd60bb6e979 100644 --- a/datafusion/functions/src/string/initcap.rs +++ b/datafusion/functions/src/string/initcap.rs @@ -16,18 +16,18 @@ // under the License. 
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; use arrow::datatypes::DataType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct InitcapFunc { signature: Signature, @@ -79,6 +79,34 @@ impl ScalarUDFImpl for InitcapFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_initcap_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_initcap_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters.") + .with_syntax_example("initcap(str)") + .with_sql_example(r#"```sql +> select initcap('apache datafusion'); ++------------------------------------+ +| initcap(Utf8("apache datafusion")) | ++------------------------------------+ +| Apache Datafusion | ++------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_related_udf("lower") + .with_related_udf("upper") + .build() + .unwrap() + }) } /// Converts the first letter of each word to upper case and the rest to lower case. Words are sequences of alphanumeric characters separated by non-alphanumeric characters. diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index 430c402a50c5..2f121426f1f8 100644 --- a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -16,7 +16,7 @@ // under the License. 
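The word rule `initcap` documents above (words are maximal alphanumeric runs) is easy to sketch without Arrow; a plain-Rust version for intuition, not the kernel itself:

```rust
// Uppercase the first character of each alphanumeric run and lowercase the
// rest; any non-alphanumeric character starts a new word.
fn initcap(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut start_of_word = true;
    for c in s.chars() {
        if c.is_alphanumeric() {
            if start_of_word {
                out.extend(c.to_uppercase());
            } else {
                out.extend(c.to_lowercase());
            }
            start_of_word = false;
        } else {
            out.push(c);
            start_of_word = true;
        }
    }
    out
}

fn main() {
    assert_eq!(initcap("apache datafusion"), "Apache Datafusion");
}
```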
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, Int32Array, Int64Array, OffsetSizeTrait}; use arrow::datatypes::DataType; @@ -25,8 +25,9 @@ use crate::utils::{make_scalar_function, utf8_to_int_type}; use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::utils::datafusion_strsim; use datafusion_common::{exec_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] @@ -83,6 +84,33 @@ impl ScalarUDFImpl for LevenshteinFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_levenshtein_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_levenshtein_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the [`Levenshtein distance`](https://en.wikipedia.org/wiki/Levenshtein_distance) between the two given strings.") + .with_syntax_example("levenshtein(str1, str2)") + .with_sql_example(r#"```sql +> select levenshtein('kitten', 'sitting'); ++---------------------------------------------+ +| levenshtein(Utf8("kitten"),Utf8("sitting")) | ++---------------------------------------------+ +| 3 | ++---------------------------------------------+ +```"#) + .with_argument("str1", "String expression to compute Levenshtein distance with str2.") + .with_argument("str2", "String expression to compute Levenshtein distance with str1.") + .build() + .unwrap() + }) } ///Returns the Levenshtein distance between the two given strings. diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs index ca324e69c0d2..25acfc276013 100644 --- a/datafusion/functions/src/string/lower.rs +++ b/datafusion/functions/src/string/lower.rs @@ -15,16 +15,16 @@ // specific language governing permissions and limitations // under the License. 
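The distance itself is the classic dynamic program; a compact reference implementation for intuition (DataFusion's actual kernel delegates to `datafusion_strsim`):

```rust
// Two-row Levenshtein: prev[j] holds the edit distance between the current
// prefix of `a` and the first j characters of `b`.
fn levenshtein(a: &str, b: &str) -> usize {
    let b_chars: Vec<char> = b.chars().collect();
    let mut prev: Vec<usize> = (0..=b_chars.len()).collect();
    for (i, ca) in a.chars().enumerate() {
        let mut cur = Vec::with_capacity(b_chars.len() + 1);
        cur.push(i + 1);
        for (j, &cb) in b_chars.iter().enumerate() {
            let cost = usize::from(ca != cb);
            // substitution, deletion from `a`, insertion into `a`
            cur.push((prev[j] + cost).min(prev[j + 1] + 1).min(cur[j] + 1));
        }
        prev = cur;
    }
    prev[b_chars.len()]
}

fn main() {
    assert_eq!(levenshtein("kitten", "sitting"), 3);
}
```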
-use std::any::Any; - use arrow::datatypes::DataType; - -use datafusion_common::Result; -use datafusion_expr::ColumnarValue; -use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; +use std::sync::OnceLock; use crate::string::common::to_lower; use crate::utils::utf8_to_str_type; +use datafusion_common::Result; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; +use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] pub struct LowerFunc { @@ -70,8 +70,37 @@ impl ScalarUDFImpl for LowerFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { to_lower(args, "lower") } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_lower_doc()) + } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_lower_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Converts a string to lower-case.") + .with_syntax_example("lower(str)") + .with_sql_example( + r#"```sql +> select lower('Ångström'); ++-------------------------+ +| lower(Utf8("Ångström")) | ++-------------------------+ +| ångström | ++-------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("initcap") + .with_related_udf("upper") + .build() + .unwrap() + }) +} #[cfg(test)] mod tests { use super::*; diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 0ddb5a205bac..1fcde9e97a1d 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -15,20 +15,20 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use arrow::array::{ArrayRef, OffsetSizeTrait}; use arrow::datatypes::DataType; +use std::any::Any; +use std::sync::OnceLock; +use crate::string::common::*; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; use datafusion_expr::function::Hint; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::string::common::*; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - /// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed. /// ltrim('zzzytest', 'xyz') = 'test' fn ltrim(args: &[ArrayRef]) -> Result { @@ -104,6 +104,41 @@ impl ScalarUDFImpl for LtrimFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_ltrim_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_ltrim_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Trims the specified trim string from the beginning of a string. 
If no trim string is provided, all whitespace is removed from the start of the input string.") + .with_syntax_example("ltrim(str[, trim_str])") + .with_sql_example(r#"```sql +> select ltrim(' datafusion '); ++-------------------------------+ +| ltrim(Utf8(" datafusion ")) | ++-------------------------------+ +| datafusion | ++-------------------------------+ +> select ltrim('___datafusion___', '_'); ++-------------------------------------------+ +| ltrim(Utf8("___datafusion___"),Utf8("_")) | ++-------------------------------------------+ +| datafusion___ | ++-------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("trim_str", "String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._") + .with_related_udf("btrim") + .with_related_udf("rtrim") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index f792914d862e..195a6c296c47 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -15,17 +15,17 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; - use arrow::compute::kernels::length::length; use arrow::datatypes::DataType; +use std::any::Any; +use std::sync::OnceLock; +use crate::utils::utf8_to_int_type; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::utf8_to_int_type; - #[derive(Debug)] pub struct OctetLengthFunc { signature: Signature, @@ -91,6 +91,36 @@ impl ScalarUDFImpl for OctetLengthFunc { }, } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_octet_length_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_octet_length_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the length of a string in bytes.") + .with_syntax_example("octet_length(str)") + .with_sql_example( + r#"```sql +> select octet_length('Ångström'); ++--------------------------------+ +| octet_length(Utf8("Ångström")) | ++--------------------------------+ +| 10 | ++--------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("bit_length") + .with_related_udf("length") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/overlay.rs b/datafusion/functions/src/string/overlay.rs index e285bd85b197..ec33840a0b0e 100644 --- a/datafusion/functions/src/string/overlay.rs +++ b/datafusion/functions/src/string/overlay.rs @@ -16,21 +16,21 @@ // under the License. 
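The `octet_length` documentation above counts UTF-8 bytes, not characters, which is why 'Ångström' reports 10. A minimal end-to-end sketch of running that documented example through a `SessionContext` (assuming the `datafusion` and `tokio` crates with default features):

```rust
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> datafusion::error::Result<()> {
    let ctx = SessionContext::new();
    // 'Ångström' is 8 characters but 10 bytes in UTF-8, so this prints 10.
    ctx.sql("SELECT octet_length('Ångström')").await?.show().await?;
    Ok(())
}
```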
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; use arrow::datatypes::DataType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct OverlayFunc { signature: Signature, @@ -87,6 +87,35 @@ impl ScalarUDFImpl for OverlayFunc { other => exec_err!("Unsupported data type {other:?} for function overlay"), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_overlay_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_overlay_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the string which is replaced by another string from the specified position and specified count length.") + .with_syntax_example("overlay(str PLACING substr FROM pos [FOR count])") + .with_sql_example(r#"```sql +> select overlay('Txxxxas' placing 'hom' from 2 for 4); ++--------------------------------------------------------+ +| overlay(Utf8("Txxxxas"),Utf8("hom"),Int64(2),Int64(4)) | ++--------------------------------------------------------+ +| Thomas | ++--------------------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("substr", "Substring to replace in str.") + .with_argument("pos", "The start position to start the replace in str.") + .with_argument("count", "The count of characters to be replaced from start position of str. If not specified, will use substr length instead.") + .build() + .unwrap() + }) } macro_rules! process_overlay { diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 20e4462784b8..3abd1767bb0a 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -16,7 +16,7 @@ // under the License. 
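The `overlay` rule documented above (keep everything before `pos`, splice in `substr`, skip `count` characters) can be illustrated with a small char-based sketch; the real kernel is array-oriented, so treat this only as a statement of the semantics:

```rust
// Sketch of overlay semantics: 1-based pos, count characters replaced.
fn overlay(s: &str, substr: &str, pos: usize, count: usize) -> String {
    debug_assert!(pos >= 1, "pos is 1-based");
    let chars: Vec<char> = s.chars().collect();
    let mut out: String = chars[..pos - 1].iter().collect(); // prefix before pos
    out.push_str(substr);                                    // the replacement
    out.extend(&chars[(pos - 1 + count).min(chars.len())..]); // suffix after the replaced span
    out
}

fn main() {
    // The documented example: overlay('Txxxxas' placing 'hom' from 2 for 4) = 'Thomas'
    assert_eq!(overlay("Txxxxas", "hom", 2, 4), "Thomas");
}
```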
 use std::any::Any;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};

 use arrow::array::{
     ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array,
@@ -25,15 +25,15 @@ use arrow::array::{
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::{Int64, LargeUtf8, Utf8, Utf8View};

+use crate::string::common::StringArrayType;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::as_int64_array;
 use datafusion_common::{exec_err, Result};
+use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
 use datafusion_expr::TypeSignature::*;
-use datafusion_expr::{ColumnarValue, Volatility};
+use datafusion_expr::{ColumnarValue, Documentation, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};

-use crate::string::common::StringArrayType;
-use crate::utils::{make_scalar_function, utf8_to_str_type};
-
 #[derive(Debug)]
 pub struct RepeatFunc {
     signature: Signature,
@@ -83,6 +83,37 @@ impl ScalarUDFImpl for RepeatFunc {
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
         make_scalar_function(repeat, vec![])(args)
     }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        Some(get_repeat_doc())
+    }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_repeat_doc() -> &'static Documentation {
+    DOCUMENTATION.get_or_init(|| {
+        Documentation::builder()
+            .with_doc_section(DOC_SECTION_STRING)
+            .with_description(
+                "Returns a string with an input string repeated a specified number of times.",
+            )
+            .with_syntax_example("repeat(str, n)")
+            .with_sql_example(
+                r#"```sql
+> select repeat('data', 3);
++-------------------------------+
+| repeat(Utf8("data"),Int64(3)) |
++-------------------------------+
+| datadatadata                  |
++-------------------------------+
+```"#,
+            )
+            .with_standard_argument("str", "String")
+            .with_argument("n", "Number of times to repeat the input string.")
+            .build()
+            .unwrap()
+    })
 }

 /// Repeats string the specified number of times.
diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs
index 13fa3d55672d..7c985b44ab9a 100644
--- a/datafusion/functions/src/string/replace.rs
+++ b/datafusion/functions/src/string/replace.rs
@@ -16,19 +16,19 @@
 // under the License.
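Both the `repeat` documentation above and the `replace` documentation in the hunk that follows map directly onto `std` string methods, which makes the documented examples easy to double-check outside the engine (a sketch of the semantics, not of the vectorized kernels):

```rust
fn main() {
    // repeat('data', 3) from the repeat example above
    assert_eq!("data".repeat(3), "datadatadata");
    // replace('ABabbaBA', 'ab', 'cd') from the replace example below
    assert_eq!("ABabbaBA".replace("ab", "cd"), "ABcdbaBA");
}
```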
 use std::any::Any;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};

 use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
 use arrow::datatypes::DataType;

+use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
 use datafusion_common::{exec_err, Result};
+use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
 use datafusion_expr::TypeSignature::*;
-use datafusion_expr::{ColumnarValue, Volatility};
+use datafusion_expr::{ColumnarValue, Documentation, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};

-use crate::utils::{make_scalar_function, utf8_to_str_type};
-
 #[derive(Debug)]
 pub struct ReplaceFunc {
     signature: Signature,
@@ -83,6 +83,34 @@ impl ScalarUDFImpl for ReplaceFunc {
             }
         }
     }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        Some(get_replace_doc())
+    }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_replace_doc() -> &'static Documentation {
+    DOCUMENTATION.get_or_init(|| {
+        Documentation::builder()
+            .with_doc_section(DOC_SECTION_STRING)
+            .with_description("Replaces all occurrences of a specified substring in a string with a new substring.")
+            .with_syntax_example("replace(str, substr, replacement)")
+            .with_sql_example(r#"```sql
+> select replace('ABabbaBA', 'ab', 'cd');
++-------------------------------------------------+
+| replace(Utf8("ABabbaBA"),Utf8("ab"),Utf8("cd")) |
++-------------------------------------------------+
+| ABcdbaBA                                        |
++-------------------------------------------------+
+```"#)
+            .with_standard_argument("str", "String")
+            .with_standard_argument("substr", "Substring expression to replace in the input string.")
+            .with_standard_argument("replacement", "Replacement substring")
+            .build()
+            .unwrap()
+    })
 }

 fn replace_view(args: &[ArrayRef]) -> Result<ArrayRef> {
diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs
index a1aa5568babb..6743ad99d3bc 100644
--- a/datafusion/functions/src/string/rtrim.rs
+++ b/datafusion/functions/src/string/rtrim.rs
@@ -16,19 +16,19 @@
 // under the License.

 use arrow::array::{ArrayRef, OffsetSizeTrait};
-use std::any::Any;
-
 use arrow::datatypes::DataType;
+use std::any::Any;
+use std::sync::OnceLock;

+use crate::string::common::*;
+use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::{exec_err, Result};
 use datafusion_expr::function::Hint;
+use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
 use datafusion_expr::TypeSignature::*;
-use datafusion_expr::{ColumnarValue, Volatility};
+use datafusion_expr::{ColumnarValue, Documentation, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};

-use crate::string::common::*;
-use crate::utils::{make_scalar_function, utf8_to_str_type};
-
 /// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed.
 /// rtrim('testxxzx', 'xyz') = 'test'
 fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
@@ -104,6 +104,41 @@ impl ScalarUDFImpl for RtrimFunc {
             ),
         }
     }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        Some(get_rtrim_doc())
+    }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_rtrim_doc() -> &'static Documentation {
+    DOCUMENTATION.get_or_init(|| {
+        Documentation::builder()
+            .with_doc_section(DOC_SECTION_STRING)
+            .with_description("Trims the specified trim string from the end of a string.
If no trim string is provided, all whitespace is removed from the end of the input string.") + .with_syntax_example("rtrim(str[, trim_str])") + .with_sql_example(r#"```sql +> select rtrim(' datafusion '); ++-------------------------------+ +| rtrim(Utf8(" datafusion ")) | ++-------------------------------+ +| datafusion | ++-------------------------------+ +> select rtrim('___datafusion___', '_'); ++-------------------------------------------+ +| rtrim(Utf8("___datafusion___"),Utf8("_")) | ++-------------------------------------------+ +| ___datafusion | ++-------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("trim_str", "String expression to trim from the end of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._") + .with_related_udf("btrim") + .with_related_udf("ltrim") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs index 8d292315a35a..2424103c84bf 100644 --- a/datafusion/functions/src/string/split_part.rs +++ b/datafusion/functions/src/string/split_part.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::utils::utf8_to_str_type; use arrow::array::{ ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait, StringViewArray, }; @@ -23,13 +24,12 @@ use arrow::datatypes::DataType; use datafusion_common::cast::as_int64_array; use datafusion_common::ScalarValue; use datafusion_common::{exec_err, DataFusionError, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::*; -use datafusion_expr::{ColumnarValue, Volatility}; +use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; use std::any::Any; -use std::sync::Arc; - -use crate::utils::utf8_to_str_type; +use std::sync::{Arc, OnceLock}; use super::common::StringArrayType; @@ -178,6 +178,34 @@ impl ScalarUDFImpl for SplitPartFunc { result.map(ColumnarValue::Array) } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_split_part_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_split_part_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Splits a string based on a specified delimiter and returns the substring in the specified position.") + .with_syntax_example("split_part(str, delimiter, pos)") + .with_sql_example(r#"```sql +> select split_part('1.2.3.4.5', '.', 3); ++--------------------------------------------------+ +| split_part(Utf8("1.2.3.4.5"),Utf8("."),Int64(3)) | ++--------------------------------------------------+ +| 3 | ++--------------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("delimiter", "String or character to split on.") + .with_argument("pos", "Position of the part to return.") + .build() + .unwrap() + }) } /// impl diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index 8450697cbf30..ff4bf01c993f 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -16,18 +16,18 @@ // under the License. 
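The `split_part` documentation above uses a 1-based position, so `split_part('1.2.3.4.5', '.', 3)` returns `'3'`. A simplified sketch of that lookup using std's `split()`, ignoring the validation and edge-case handling the engine performs:

```rust
// 1-based, positive positions only: return the pos-th delimited part, if any.
fn split_part<'a>(s: &'a str, delim: &str, pos: usize) -> Option<&'a str> {
    s.split(delim).nth(pos - 1)
}

fn main() {
    assert_eq!(split_part("1.2.3.4.5", ".", 3), Some("3")); // the documented example
    assert_eq!(split_part("1.2.3.4.5", ".", 9), None);      // past the last part
}
```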
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::ArrayRef; use arrow::datatypes::DataType; +use crate::utils::make_scalar_function; use datafusion_common::{internal_err, Result}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::*; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use crate::utils::make_scalar_function; - /// Returns true if string starts with prefix. /// starts_with('alphabet', 'alph') = 't' pub fn starts_with(args: &[ArrayRef]) -> Result { @@ -89,6 +89,35 @@ impl ScalarUDFImpl for StartsWithFunc { _ => internal_err!("Unsupported data types for starts_with. Expected Utf8, LargeUtf8 or Utf8View")?, } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_starts_with_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_starts_with_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Tests if a string starts with a substring.") + .with_syntax_example("starts_with(str, substr)") + .with_sql_example( + r#"```sql +> select starts_with('datafusion','data'); ++----------------------------------------------+ +| starts_with(Utf8("datafusion"),Utf8("data")) | ++----------------------------------------------+ +| true | ++----------------------------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_argument("substr", "Substring to test for.") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs index 79aa9254f9b1..72cd4fbffa33 100644 --- a/datafusion/functions/src/string/to_hex.rs +++ b/datafusion/functions/src/string/to_hex.rs @@ -16,21 +16,21 @@ // under the License. use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; use arrow::datatypes::{ ArrowNativeType, ArrowPrimitiveType, DataType, Int32Type, Int64Type, }; +use crate::utils::make_scalar_function; use datafusion_common::cast::as_primitive_array; use datafusion_common::Result; use datafusion_common::{exec_err, plan_err}; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; -use crate::utils::make_scalar_function; - /// Converts the number to its equivalent hexadecimal representation. 
/// to_hex(2147483647) = '7fffffff' pub fn to_hex(args: &[ArrayRef]) -> Result @@ -110,6 +110,34 @@ impl ScalarUDFImpl for ToHexFunc { other => exec_err!("Unsupported data type {other:?} for function to_hex"), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_to_hex_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_to_hex_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Converts an integer to a hexadecimal string.") + .with_syntax_example("to_hex(int)") + .with_sql_example( + r#"```sql +> select to_hex(12345689); ++-------------------------+ +| to_hex(Int64(12345689)) | ++-------------------------+ +| bc6159 | ++-------------------------+ +```"#, + ) + .with_standard_argument("int", "Integer") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs index 593e33ab6bb4..caef7f655222 100644 --- a/datafusion/functions/src/string/upper.rs +++ b/datafusion/functions/src/string/upper.rs @@ -19,9 +19,11 @@ use crate::string::common::to_upper; use crate::utils::utf8_to_str_type; use arrow::datatypes::DataType; use datafusion_common::Result; -use datafusion_expr::ColumnarValue; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ColumnarValue, Documentation}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; +use std::sync::OnceLock; #[derive(Debug)] pub struct UpperFunc { @@ -67,6 +69,36 @@ impl ScalarUDFImpl for UpperFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { to_upper(args, "upper") } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_upper_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_upper_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Converts a string to upper-case.") + .with_syntax_example("upper(str)") + .with_sql_example( + r#"```sql +> select upper('dataFusion'); ++---------------------------+ +| upper(Utf8("dataFusion")) | ++---------------------------+ +| DATAFUSION | ++---------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .with_related_udf("initcap") + .with_related_udf("lower") + .build() + .unwrap() + }) } #[cfg(test)] diff --git a/datafusion/functions/src/string/uuid.rs b/datafusion/functions/src/string/uuid.rs index 3ddc320fcec1..0fbdce16ccd1 100644 --- a/datafusion/functions/src/string/uuid.rs +++ b/datafusion/functions/src/string/uuid.rs @@ -16,7 +16,7 @@ // under the License. 
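The `uuid.rs` change that follows documents a function that fills its output array with one freshly generated value per row. A rough sketch of that generation step, assuming the `uuid` crate the implementation already imports:

```rust
use uuid::Uuid;

fn main() {
    // One fresh v4 value per "row", analogous to how UuidFunc builds its array.
    let values: Vec<String> = std::iter::repeat_with(|| Uuid::new_v4().to_string())
        .take(3)
        .collect();
    assert_eq!(values.len(), 3);
    assert_eq!(values[0].len(), 36); // 32 hex digits plus 4 hyphens
}
```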
 use std::any::Any;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};

 use arrow::array::GenericStringArray;
 use arrow::datatypes::DataType;
@@ -24,7 +24,8 @@ use arrow::datatypes::DataType::Utf8;
 use uuid::Uuid;

 use datafusion_common::{not_impl_err, Result};
-use datafusion_expr::{ColumnarValue, Volatility};
+use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
+use datafusion_expr::{ColumnarValue, Documentation, Volatility};
 use datafusion_expr::{ScalarUDFImpl, Signature};

 #[derive(Debug)]
@@ -74,4 +75,29 @@ impl ScalarUDFImpl for UuidFunc {
         let array = GenericStringArray::<i32>::from_iter_values(values);
         Ok(ColumnarValue::Array(Arc::new(array)))
     }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        Some(get_uuid_doc())
+    }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_uuid_doc() -> &'static Documentation {
+    DOCUMENTATION.get_or_init(|| {
+        Documentation::builder()
+            .with_doc_section(DOC_SECTION_STRING)
+            .with_description("Returns a [`UUID v4`](https://en.wikipedia.org/wiki/Universally_unique_identifier#Version_4_(random)) string value that is unique per row.")
+            .with_syntax_example("uuid()")
+            .with_sql_example(r#"```sql
+> select uuid();
++--------------------------------------+
+| uuid()                               |
++--------------------------------------+
+| 6ec17ef8-1934-41cc-8d59-d0c8f9eea1f0 |
++--------------------------------------+
+```"#)
+            .build()
+            .unwrap()
+    })
 }
diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs
index c9dc96b2a935..bfb60bfbe259 100644
--- a/datafusion/functions/src/unicode/character_length.rs
+++ b/datafusion/functions/src/unicode/character_length.rs
@@ -22,9 +22,12 @@ use arrow::array::{
 };
 use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
 use datafusion_common::Result;
-use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
+use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING;
+use datafusion_expr::{
+    ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility,
+};
 use std::any::Any;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};

 #[derive(Debug)]
 pub struct CharacterLengthFunc {
@@ -76,6 +79,36 @@ impl ScalarUDFImpl for CharacterLengthFunc {
     fn aliases(&self) -> &[String] {
         &self.aliases
     }
+
+    fn documentation(&self) -> Option<&Documentation> {
+        Some(get_character_length_doc())
+    }
+}
+
+static DOCUMENTATION: OnceLock<Documentation> = OnceLock::new();
+
+fn get_character_length_doc() -> &'static Documentation {
+    DOCUMENTATION.get_or_init(|| {
+        Documentation::builder()
+            .with_doc_section(DOC_SECTION_STRING)
+            .with_description("Returns the number of characters in a string.")
+            .with_syntax_example("character_length(str)")
+            .with_sql_example(
+                r#"```sql
+> select character_length('Ångström');
++------------------------------------+
+| character_length(Utf8("Ångström")) |
++------------------------------------+
+| 8                                  |
++------------------------------------+
+```"#,
+            )
+            .with_standard_argument("str", "String")
+            .with_related_udf("bit_length")
+            .with_related_udf("octet_length")
+            .build()
+            .unwrap()
+    })
 }

 /// Returns number of characters in the string.
diff --git a/datafusion/functions/src/unicode/find_in_set.rs b/datafusion/functions/src/unicode/find_in_set.rs
index 41a2b9d9e72d..cad860e41088 100644
--- a/datafusion/functions/src/unicode/find_in_set.rs
+++ b/datafusion/functions/src/unicode/find_in_set.rs
@@ -16,7 +16,7 @@
 // under the License.
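The 'Ångström' examples used by `character_length`, `octet_length`, and `bit_length` all describe the same string measured three ways; plain Rust makes the relationship explicit:

```rust
fn main() {
    let s = "Ångström";
    assert_eq!(s.chars().count(), 8); // character_length: Unicode characters
    assert_eq!(s.len(), 10);          // octet_length: UTF-8 bytes (Å and ö take 2 each)
    assert_eq!(s.len() * 8, 80);      // bit_length: eight times the byte count
}
```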
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, @@ -24,11 +24,13 @@ use arrow::array::{ }; use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; +use crate::utils::{make_scalar_function, utf8_to_int_type}; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; - -use crate::utils::{make_scalar_function, utf8_to_int_type}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct FindInSetFunc { @@ -77,6 +79,33 @@ impl ScalarUDFImpl for FindInSetFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(find_in_set, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_find_in_set_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_find_in_set_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings.") + .with_syntax_example("find_in_set(str, strlist)") + .with_sql_example(r#"```sql +> select find_in_set('b', 'a,b,c,d'); ++----------------------------------------+ +| find_in_set(Utf8("b"),Utf8("a,b,c,d")) | ++----------------------------------------+ +| 2 | ++----------------------------------------+ +```"#) + .with_argument("str", "String expression to find in strlist.") + .with_argument("strlist", "A string list is a string composed of substrings separated by , characters.") + .build() + .unwrap() + }) } ///Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index c49784948dd0..6610cfb25e79 100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::cmp::Ordering; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, @@ -25,15 +25,17 @@ use arrow::array::{ }; use arrow::datatypes::DataType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; use datafusion_common::exec_err; use datafusion_common::Result; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; - -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct LeftFunc { @@ -91,6 +93,34 @@ impl ScalarUDFImpl for LeftFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_left_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_left_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns a specified number of characters from the 
left side of a string.") + .with_syntax_example("left(str, n)") + .with_sql_example(r#"```sql +> select left('datafusion', 4); ++-----------------------------------+ +| left(Utf8("datafusion"),Int64(4)) | ++-----------------------------------+ +| data | ++-----------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("n", "Number of characters to return.") + .with_related_udf("right") + .build() + .unwrap() + }) } /// Returns first n characters in the string, or when n is negative, returns all but last |n| characters. diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index e102673c4253..48bd583720aa 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::fmt::Write; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, @@ -27,13 +27,15 @@ use arrow::datatypes::DataType; use unicode_segmentation::UnicodeSegmentation; use DataType::{LargeUtf8, Utf8, Utf8View}; +use crate::string::common::StringArrayType; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; - -use crate::string::common::StringArrayType; -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct LPadFunc { @@ -95,6 +97,35 @@ impl ScalarUDFImpl for LPadFunc { other => exec_err!("Unsupported data type {other:?} for function lpad"), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_lpad_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_lpad_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Pads the left side of a string with another string to a specified string length.") + .with_syntax_example("lpad(str, n[, padding_str])") + .with_sql_example(r#"```sql +> select lpad('Dolly', 10, 'hello'); ++---------------------------------------------+ +| lpad(Utf8("Dolly"),Int64(10),Utf8("hello")) | ++---------------------------------------------+ +| helloDolly | ++---------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("n", "String length to pad to.") + .with_argument("padding_str", "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._") + .with_related_udf("rpad") + .build() + .unwrap() + }) } /// Extends the string to length 'length' by prepending the characters fill (a space by default). diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs index da16d3ee3752..32872c28a613 100644 --- a/datafusion/functions/src/unicode/reverse.rs +++ b/datafusion/functions/src/unicode/reverse.rs @@ -16,19 +16,21 @@ // under the License. 
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use arrow::array::{ Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, }; use arrow::datatypes::DataType; use datafusion_common::{exec_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; use DataType::{LargeUtf8, Utf8, Utf8View}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct ReverseFunc { signature: Signature, @@ -79,6 +81,34 @@ impl ScalarUDFImpl for ReverseFunc { } } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_reverse_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_reverse_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Reverses the character order of a string.") + .with_syntax_example("reverse(str)") + .with_sql_example( + r#"```sql +> select reverse('datafusion'); ++-----------------------------+ +| reverse(Utf8("datafusion")) | ++-----------------------------+ +| noisufatad | ++-----------------------------+ +```"#, + ) + .with_standard_argument("str", "String") + .build() + .unwrap() + }) } /// Reverses the order of the characters in the string. diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index 9d542bb2c006..585611fe60e4 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -17,7 +17,7 @@ use std::any::Any; use std::cmp::{max, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, @@ -31,8 +31,11 @@ use datafusion_common::cast::{ }; use datafusion_common::exec_err; use datafusion_common::Result; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct RightFunc { @@ -90,6 +93,34 @@ impl ScalarUDFImpl for RightFunc { ), } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_right_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_right_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns a specified number of characters from the right side of a string.") + .with_syntax_example("right(str, n)") + .with_sql_example(r#"```sql +> select right('datafusion', 6); ++------------------------------------+ +| right(Utf8("datafusion"),Int64(6)) | ++------------------------------------+ +| fusion | ++------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("n", "Number of characters to return") + .with_related_udf("left") + .build() + .unwrap() + }) } /// Returns last n characters in the string, or when n is negative, returns all but first |n| characters. 
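The doc comment just above spells out the negative-`n` behaviour of `right`; a small sketch expressing the same rule over characters:

```rust
// right(): last n chars, or all but the first |n| chars when n is negative.
fn right(s: &str, n: i64) -> String {
    let len = s.chars().count() as i64;
    let skip = if n >= 0 { (len - n).max(0) } else { -n };
    s.chars().skip(skip as usize).collect()
}

fn main() {
    assert_eq!(right("datafusion", 6), "fusion");  // the documented example
    assert_eq!(right("datafusion", -4), "fusion"); // all but the first 4 chars
}
```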
diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index 05ecff05a179..9ca65e229c0c 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -47,27 +47,6 @@ impl Default for RPadFunc { } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_rpad_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder() - .with_doc_section(DOC_SECTION_STRING) - .with_description("Pads the right side of a string with another string to a specified string length.") - .with_syntax_example("rpad(str, n[, padding_str])") - .with_standard_argument( - "str", - "String", - ) - .with_argument("n", "String length to pad to.") - .with_argument("padding_str", - "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._") - .with_related_udf("lpad") - .build() - .unwrap() - }) -} - impl RPadFunc { pub fn new() -> Self { use DataType::*; @@ -143,6 +122,35 @@ impl ScalarUDFImpl for RPadFunc { } } +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_rpad_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Pads the right side of a string with another string to a specified string length.") + .with_syntax_example("rpad(str, n[, padding_str])") + .with_sql_example(r#"```sql +> select rpad('datafusion', 20, '_-'); ++-----------------------------------------------+ +| rpad(Utf8("datafusion"),Int64(20),Utf8("_-")) | ++-----------------------------------------------+ +| datafusion_-_-_-_-_- | ++-----------------------------------------------+ +```"#) + .with_standard_argument( + "str", + "String", + ) + .with_argument("n", "String length to pad to.") + .with_argument("padding_str", + "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._") + .with_related_udf("lpad") + .build() + .unwrap() + }) +} + pub fn rpad( args: &[ArrayRef], ) -> Result { diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs index 6da67c8a2798..eaff62c338a0 100644 --- a/datafusion/functions/src/unicode/strpos.rs +++ b/datafusion/functions/src/unicode/strpos.rs @@ -16,7 +16,7 @@ // under the License. 
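Since the `rpad.rs` hunk above only relocates `get_rpad_doc` below the `impl` block (and adds the SQL example), a cheap regression guard is to assert that `documentation()` is still wired up. A hypothetical test sketch; the module path and constructor are assumed from the surrounding diff:

```rust
#[cfg(test)]
mod doc_tests {
    use datafusion_expr::ScalarUDFImpl;

    #[test]
    fn rpad_documentation_is_wired_up() {
        // Hypothetical path: RPadFunc::new() as defined earlier in this file.
        let rpad = crate::unicode::rpad::RPadFunc::new();
        assert!(rpad.documentation().is_some());
    }
}
```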
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ArrayRef, ArrowPrimitiveType, AsArray, PrimitiveArray}; use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; @@ -24,8 +24,11 @@ use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; use crate::string::common::StringArrayType; use crate::utils::{make_scalar_function, utf8_to_int_type}; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct StrposFunc { @@ -84,6 +87,33 @@ impl ScalarUDFImpl for StrposFunc { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_strpos_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_strpos_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Returns the starting position of a specified substring in a string. Positions begin at 1. If the substring does not exist in the string, the function returns 0.") + .with_syntax_example("strpos(str, substr)") + .with_sql_example(r#"```sql +> select strpos('datafusion', 'fus'); ++----------------------------------------+ +| strpos(Utf8("datafusion"),Utf8("fus")) | ++----------------------------------------+ +| 5 | ++----------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("substr", "Substring expression to search for.") + .build() + .unwrap() + }) } fn strpos(args: &[ArrayRef]) -> Result { diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs index 205de0b30b9c..c253ef7e03e9 100644 --- a/datafusion/functions/src/unicode/substr.rs +++ b/datafusion/functions/src/unicode/substr.rs @@ -16,7 +16,7 @@ // under the License. 
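The `strpos` documentation above promises 1-based positions and 0 for a miss. Note that `str::find` returns a *byte* offset, so a character-oriented sketch has to convert before adding one:

```rust
// Sketch of strpos: 1-based character position, 0 when the substring is absent.
fn strpos(s: &str, sub: &str) -> usize {
    match s.find(sub) {
        // find() yields a byte offset; count the chars before it to get a char position.
        Some(byte_idx) => s[..byte_idx].chars().count() + 1,
        None => 0,
    }
}

fn main() {
    assert_eq!(strpos("datafusion", "fus"), 5); // the documented example
    assert_eq!(strpos("datafusion", "xyz"), 0);
}
```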
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use crate::string::common::{make_and_append_view, StringArrayType}; use crate::utils::{make_scalar_function, utf8_to_str_type}; @@ -28,7 +28,10 @@ use arrow::datatypes::DataType; use arrow_buffer::{NullBufferBuilder, ScalarBuffer}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, plan_err, Result}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct SubstrFunc { @@ -138,6 +141,34 @@ impl ScalarUDFImpl for SubstrFunc { ]) } } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_substr_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_substr_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Extracts a substring of a specified number of characters from a specific starting position in a string.") + .with_syntax_example("substr(str, start_pos[, length])") + .with_sql_example(r#"```sql +> select substr('datafusion', 5, 3); ++----------------------------------------------+ +| substr(Utf8("datafusion"),Int64(5),Int64(3)) | ++----------------------------------------------+ +| fus | ++----------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("start_pos", "Character position to start the substring at. The first character in the string has a position of 1.") + .with_argument("length", "Number of characters to extract. If not specified, returns the rest of the string after the start position.") + .build() + .unwrap() + }) } /// Extracts the substring of string starting at the start'th character, and extending for count characters if that is specified. (Same as substring(string from start for count).) diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs index 6591ee26403a..436d554a49f7 100644 --- a/datafusion/functions/src/unicode/substrindex.rs +++ b/datafusion/functions/src/unicode/substrindex.rs @@ -16,7 +16,7 @@ // under the License. 
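The `substr` documentation above is also character-oriented with a 1-based start. A simplified sketch of the positive-start case (the engine additionally validates and handles non-positive starts):

```rust
// Sketch of substr: 1-based start position, optional char count.
fn substr(s: &str, start: usize, len: Option<usize>) -> String {
    let tail = s.chars().skip(start.saturating_sub(1));
    match len {
        Some(n) => tail.take(n).collect(),
        None => tail.collect(), // no length: rest of the string
    }
}

fn main() {
    assert_eq!(substr("datafusion", 5, Some(3)), "fus"); // the documented example
    assert_eq!(substr("datafusion", 5, None), "fusion");
}
```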
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, @@ -24,11 +24,13 @@ use arrow::array::{ }; use arrow::datatypes::{DataType, Int32Type, Int64Type}; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; - -use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct SubstrIndexFunc { @@ -83,6 +85,42 @@ impl ScalarUDFImpl for SubstrIndexFunc { fn aliases(&self) -> &[String] { &self.aliases } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_substr_index_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_substr_index_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description(r#"Returns the substring from str before count occurrences of the delimiter delim. +If count is positive, everything to the left of the final delimiter (counting from the left) is returned. +If count is negative, everything to the right of the final delimiter (counting from the right) is returned."#) + .with_syntax_example("substr_index(str, delim, count)") + .with_sql_example(r#"```sql +> select substr_index('www.apache.org', '.', 1); ++---------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(1)) | ++---------------------------------------------------------+ +| www | ++---------------------------------------------------------+ +> select substr_index('www.apache.org', '.', -1); ++----------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(-1)) | ++----------------------------------------------------------+ +| org | ++----------------------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("delim", "The string to find in str to split str.") + .with_argument("count", "The number of times to search for the delimiter. Can be either a positive or negative number.") + .build() + .unwrap() + }) } /// Returns the substring from str before count occurrences of the delimiter delim. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index a42b9c6cb857..cbee9a6fe1f2 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -16,7 +16,7 @@ // under the License. 
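The positive/negative `count` rule that `substr_index` documents above can be restated with a split-and-rejoin sketch; the `count = 0` branch follows the usual MySQL convention and is an assumption here:

```rust
// Sketch of substr_index for positive and negative counts.
fn substr_index(s: &str, delim: &str, count: i64) -> String {
    if count == 0 {
        return String::new(); // assumed: MySQL returns '' for count = 0
    }
    let parts: Vec<&str> = s.split(delim).collect();
    let n = count.unsigned_abs() as usize;
    if n >= parts.len() {
        return s.to_string(); // fewer delimiters than |count|: whole string
    }
    if count > 0 {
        parts[..n].join(delim) // everything left of the n-th delimiter
    } else {
        parts[parts.len() - n..].join(delim) // everything right of it, from the end
    }
}

fn main() {
    assert_eq!(substr_index("www.apache.org", ".", 1), "www");
    assert_eq!(substr_index("www.apache.org", ".", -1), "org");
}
```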
use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use arrow::array::{ ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, @@ -27,8 +27,11 @@ use unicode_segmentation::UnicodeSegmentation; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; +use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; use datafusion_expr::TypeSignature::Exact; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; #[derive(Debug)] pub struct TranslateFunc { @@ -76,6 +79,34 @@ impl ScalarUDFImpl for TranslateFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { make_scalar_function(invoke_translate, vec![])(args) } + + fn documentation(&self) -> Option<&Documentation> { + Some(get_translate_doc()) + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +fn get_translate_doc() -> &'static Documentation { + DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_STRING) + .with_description("Translates characters in a string to specified translation characters.") + .with_syntax_example("translate(str, chars, translation)") + .with_sql_example(r#"```sql +> select translate('twice', 'wic', 'her'); ++--------------------------------------------------+ +| translate(Utf8("twice"),Utf8("wic"),Utf8("her")) | ++--------------------------------------------------+ +| there | ++--------------------------------------------------+ +```"#) + .with_standard_argument("str", "String") + .with_argument("chars", "Characters to translate.") + .with_argument("translation", "Translation characters. Translation characters replace only characters at the same position in the **chars** string.") + .build() + .unwrap() + }) } fn invoke_translate(args: &[ArrayRef]) -> Result { diff --git a/docs/source/user-guide/sql/aggregate_functions_new.md b/docs/source/user-guide/sql/aggregate_functions_new.md index 213894d7da06..236ef57da502 100644 --- a/docs/source/user-guide/sql/aggregate_functions_new.md +++ b/docs/source/user-guide/sql/aggregate_functions_new.md @@ -90,8 +90,9 @@ var(expression) - **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. -#### Aliases- var_sample +#### Aliases +- var_sample - var_samp ### `var_pop` @@ -106,50 +107,18 @@ var_pop(expression) - **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. -#### Aliases- var_population +#### Aliases -### `var_pop` - -Returns the statistical population variance of a set of numbers. - -``` -var_pop(expression) -``` - -#### Arguments - -- **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. - -#### Aliases- var_population - -### `var` - -Returns the statistical sample variance of a set of numbers. - -``` -var(expression) -``` - -#### Arguments - -- **expression**: Numeric expression to operate on. Can be a constant, column, or function, and any combination of operators. - -#### Aliases- var_sample - -- var_samp +- var_population -### `var` +### `var_population` -Returns the statistical sample variance of a set of numbers. +_Alias of [var_pop](#var_pop)._ -``` -var(expression) -``` +### `var_samp` -#### Arguments - -- **expression**: Numeric expression to operate on. 
Can be a constant, column, or function, and any combination of operators. +_Alias of [var](#var)._ -#### Aliases- var_sample +### `var_sample` -- var_samp +_Alias of [var](#var)._ diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 469fb705b71f..3e481db90c22 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -644,618 +644,7 @@ _Alias of [nvl](#nvl)._ ## String Functions -- [ascii](#ascii) -- [bit_length](#bit_length) -- [btrim](#btrim) -- [char_length](#char_length) -- [character_length](#character_length) -- [concat](#concat) -- [concat_ws](#concat_ws) -- [chr](#chr) -- [ends_with](#ends_with) -- [initcap](#initcap) -- [instr](#instr) -- [left](#left) -- [length](#length) -- [lower](#lower) -- [lpad](#lpad) -- [ltrim](#ltrim) -- [octet_length](#octet_length) -- [repeat](#repeat) -- [replace](#replace) -- [reverse](#reverse) -- [right](#right) -- [rpad](#rpad) -- [rtrim](#rtrim) -- [split_part](#split_part) -- [starts_with](#starts_with) -- [strpos](#strpos) -- [substr](#substr) -- [to_hex](#to_hex) -- [translate](#translate) -- [trim](#trim) -- [upper](#upper) -- [uuid](#uuid) -- [overlay](#overlay) -- [levenshtein](#levenshtein) -- [substr_index](#substr_index) -- [find_in_set](#find_in_set) -- [position](#position) -- [contains](#contains) - -### `ascii` - -Returns the ASCII value of the first character in a string. - -``` -ascii(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[chr](#chr) - -### `bit_length` - -Returns the bit length of a string. - -``` -bit_length(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[length](#length), -[octet_length](#octet_length) - -### `btrim` - -Trims the specified trim string from the start and end of a string. -If no trim string is provided, all whitespace is removed from the start and end -of the input string. - -``` -btrim(str[, trim_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **trim_str**: String expression to trim from the beginning and end of the input string. - Can be a constant, column, or function, and any combination of arithmetic operators. - _Default is whitespace characters._ - -**Related functions**: -[ltrim](#ltrim), -[rtrim](#rtrim) - -#### Aliases - -- trim - -### `char_length` - -_Alias of [length](#length)._ - -### `character_length` - -_Alias of [length](#length)._ - -### `concat` - -Concatenates multiple strings together. - -``` -concat(str[, ..., str_n]) -``` - -#### Arguments - -- **str**: String expression to concatenate. - Can be a constant, column, or function, and any combination of string operators. -- **str_n**: Subsequent string column or literal string to concatenate. - -**Related functions**: -[concat_ws](#concat_ws) - -### `concat_ws` - -Concatenates multiple strings together with a specified separator. - -``` -concat_ws(separator, str[, ..., str_n]) -``` - -#### Arguments - -- **separator**: Separator to insert between concatenated strings. -- **str**: String expression to concatenate. - Can be a constant, column, or function, and any combination of string operators. 
-- **str_n**: Subsequent string column or literal string to concatenate. - -**Related functions**: -[concat](#concat) - -### `chr` - -Returns the character with the specified ASCII or Unicode code value. - -``` -chr(expression) -``` - -#### Arguments - -- **expression**: Expression containing the ASCII or Unicode code value to operate on. - Can be a constant, column, or function, and any combination of arithmetic or - string operators. - -**Related functions**: -[ascii](#ascii) - -### `ends_with` - -Tests if a string ends with a substring. - -``` -ends_with(str, substr) -``` - -#### Arguments - -- **str**: String expression to test. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring to test for. - -### `initcap` - -Capitalizes the first character in each word in the input string. -Words are delimited by non-alphanumeric characters. - -``` -initcap(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[lower](#lower), -[upper](#upper) - -### `instr` - -_Alias of [strpos](#strpos)._ - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring expression to search for. - Can be a constant, column, or function, and any combination of string operators. - -### `left` - -Returns a specified number of characters from the left side of a string. - -``` -left(str, n) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **n**: Number of characters to return. - -**Related functions**: -[right](#right) - -### `length` - -Returns the number of characters in a string. - -``` -length(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -#### Aliases - -- char_length -- character_length - -**Related functions**: -[bit_length](#bit_length), -[octet_length](#octet_length) - -### `lower` - -Converts a string to lower-case. - -``` -lower(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[initcap](#initcap), -[upper](#upper) - -### `lpad` - -Pads the left side of a string with another string to a specified string length. - -``` -lpad(str, n[, padding_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **n**: String length to pad to. -- **padding_str**: String expression to pad with. - Can be a constant, column, or function, and any combination of string operators. - _Default is a space._ - -**Related functions**: -[rpad](#rpad) - -### `ltrim` - -Trims the specified trim string from the beginning of a string. -If no trim string is provided, all whitespace is removed from the start -of the input string. - -``` -ltrim(str[, trim_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **trim_str**: String expression to trim from the beginning of the input string. - Can be a constant, column, or function, and any combination of arithmetic operators. 
- _Default is whitespace characters._ - -**Related functions**: -[btrim](#btrim), -[rtrim](#rtrim) - -### `octet_length` - -Returns the length of a string in bytes. - -``` -octet_length(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[bit_length](#bit_length), -[length](#length) - -### `repeat` - -Returns a string with an input string repeated a specified number. - -``` -repeat(str, n) -``` - -#### Arguments - -- **str**: String expression to repeat. - Can be a constant, column, or function, and any combination of string operators. -- **n**: Number of times to repeat the input string. - -### `replace` - -Replaces all occurrences of a specified substring in a string with a new substring. - -``` -replace(str, substr, replacement) -``` - -#### Arguments - -- **str**: String expression to repeat. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring expression to replace in the input string. - Can be a constant, column, or function, and any combination of string operators. -- **replacement**: Replacement substring expression. - Can be a constant, column, or function, and any combination of string operators. - -### `reverse` - -Reverses the character order of a string. - -``` -reverse(str) -``` - -#### Arguments - -- **str**: String expression to repeat. - Can be a constant, column, or function, and any combination of string operators. - -### `right` - -Returns a specified number of characters from the right side of a string. - -``` -right(str, n) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **n**: Number of characters to return. - -**Related functions**: -[left](#left) - -### `rpad` - -Pads the right side of a string with another string to a specified string length. - -``` -rpad(str, n[, padding_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **n**: String length to pad to. -- **padding_str**: String expression to pad with. - Can be a constant, column, or function, and any combination of string operators. - _Default is a space._ - -**Related functions**: -[lpad](#lpad) - -### `rtrim` - -Trims the specified trim string from the end of a string. -If no trim string is provided, all whitespace is removed from the end -of the input string. - -``` -rtrim(str[, trim_str]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **trim_str**: String expression to trim from the end of the input string. - Can be a constant, column, or function, and any combination of arithmetic operators. - _Default is whitespace characters._ - -**Related functions**: -[btrim](#btrim), -[ltrim](#ltrim) - -### `split_part` - -Splits a string based on a specified delimiter and returns the substring in the -specified position. - -``` -split_part(str, delimiter, pos) -``` - -#### Arguments - -- **str**: String expression to spit. - Can be a constant, column, or function, and any combination of string operators. -- **delimiter**: String or character to split on. -- **pos**: Position of the part to return. - -### `starts_with` - -Tests if a string starts with a substring. 
- -``` -starts_with(str, substr) -``` - -#### Arguments - -- **str**: String expression to test. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring to test for. - -### `strpos` - -Returns the starting position of a specified substring in a string. -Positions begin at 1. -If the substring does not exist in the string, the function returns 0. - -``` -strpos(str, substr) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **substr**: Substring expression to search for. - Can be a constant, column, or function, and any combination of string operators. - -#### Aliases - -- instr - -### `substr` - -Extracts a substring of a specified number of characters from a specific -starting position in a string. - -``` -substr(str, start_pos[, length]) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **start_pos**: Character position to start the substring at. - The first character in the string has a position of 1. -- **length**: Number of characters to extract. - If not specified, returns the rest of the string after the start position. - -#### Aliases - -- substring - -### `substring` - -_Alias of [substr](#substr)._ - -### `translate` - -Translates characters in a string to specified translation characters. - -``` -translate(str, chars, translation) -``` - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. -- **chars**: Characters to translate. -- **translation**: Translation characters. Translation characters replace only - characters at the same position in the **chars** string. - -### `to_hex` - -Converts an integer to a hexadecimal string. - -``` -to_hex(int) -``` - -#### Arguments - -- **int**: Integer expression to convert. - Can be a constant, column, or function, and any combination of arithmetic operators. - -### `trim` - -_Alias of [btrim](#btrim)._ - -### `upper` - -Converts a string to upper-case. - -``` -upper(str) -``` - -#### Arguments - -- **str**: String expression to operate on. - Can be a constant, column, or function, and any combination of string operators. - -**Related functions**: -[initcap](#initcap), -[lower](#lower) - -### `uuid` - -Returns UUID v4 string value which is unique per row. - -``` -uuid() -``` - -### `overlay` - -Returns the string which is replaced by another string from the specified position and specified count length. -For example, `overlay('Txxxxas' placing 'hom' from 2 for 4) → Thomas` - -``` -overlay(str PLACING substr FROM pos [FOR count]) -``` - -#### Arguments - -- **str**: String expression to operate on. -- **substr**: the string to replace part of str. -- **pos**: the start position to replace of str. -- **count**: the count of characters to be replaced from start position of str. If not specified, will use substr length instead. - -### `levenshtein` - -Returns the Levenshtein distance between the two given strings. -For example, `levenshtein('kitten', 'sitting') = 3` - -``` -levenshtein(str1, str2) -``` - -#### Arguments - -- **str1**: String expression to compute Levenshtein distance with str2. -- **str2**: String expression to compute Levenshtein distance with str1. - -### `substr_index` - -Returns the substring from str before count occurrences of the delimiter delim. 
-If count is positive, everything to the left of the final delimiter (counting from the left) is returned. -If count is negative, everything to the right of the final delimiter (counting from the right) is returned. -For example, `substr_index('www.apache.org', '.', 1) = www`, `substr_index('www.apache.org', '.', -1) = org` - -``` -substr_index(str, delim, count) -``` - -#### Arguments - -- **str**: String expression to operate on. -- **delim**: the string to find in str to split str. -- **count**: The number of times to search for the delimiter. Can be both a positive or negative number. - -### `find_in_set` - -Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings. -For example, `find_in_set('b', 'a,b,c,d') = 2` - -``` -find_in_set(str, strlist) -``` - -#### Arguments - -- **str**: String expression to find in strlist. -- **strlist**: A string list is a string composed of substrings separated by , characters. +See the new documentation [`here`](https://datafusion.apache.org/user-guide/sql/scalar_functions_new.html) ## Binary String Functions @@ -1452,19 +841,6 @@ position(substr in origstr) - **substr**: The pattern string. - **origstr**: The model string. -### `contains` - -Return true if search_string is found within string (case-sensitive). - -``` -contains(string, search_string) -``` - -#### Arguments - -- **string**: The pattern string. -- **search_string**: The model string. - ## Time and Date Functions - [now](#now) diff --git a/docs/source/user-guide/sql/scalar_functions_new.md b/docs/source/user-guide/sql/scalar_functions_new.md index bff2c0f485c3..2423f9c4757d 100644 --- a/docs/source/user-guide/sql/scalar_functions_new.md +++ b/docs/source/user-guide/sql/scalar_functions_new.md @@ -66,11 +66,48 @@ coalesce(expression1[, ..., expression_n]) ## String Functions - [ascii](#ascii) +- [bit_length](#bit_length) +- [btrim](#btrim) +- [char_length](#char_length) +- [character_length](#character_length) +- [chr](#chr) +- [concat](#concat) +- [concat_ws](#concat_ws) +- [contains](#contains) +- [ends_with](#ends_with) +- [find_in_set](#find_in_set) +- [initcap](#initcap) +- [instr](#instr) +- [left](#left) +- [length](#length) +- [levenshtein](#levenshtein) +- [lower](#lower) +- [lpad](#lpad) +- [ltrim](#ltrim) +- [octet_length](#octet_length) +- [position](#position) +- [repeat](#repeat) +- [replace](#replace) +- [reverse](#reverse) +- [right](#right) - [rpad](#rpad) +- [rtrim](#rtrim) +- [split_part](#split_part) +- [starts_with](#starts_with) +- [strpos](#strpos) +- [substr](#substr) +- [substr_index](#substr_index) +- [substring](#substring) +- [substring_index](#substring_index) +- [to_hex](#to_hex) +- [translate](#translate) +- [trim](#trim) +- [upper](#upper) +- [uuid](#uuid) ### `ascii` -Returns the ASCII value of the first character in a string. +Returns the Unicode character code of the first character in a string. ``` ascii(str) @@ -78,12 +115,601 @@ ascii(str) #### Arguments -- **str**: String expression to operate on. Can be a constant, column, or function that evaluates to or can be coerced to a Utf8, LargeUtf8 or a Utf8View. +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. 
+
+#### Example
+
+```sql
+> select ascii('abc');
++--------------------+
+| ascii(Utf8("abc")) |
++--------------------+
+| 97 |
++--------------------+
+> select ascii('🚀');
++-------------------+
+| ascii(Utf8("🚀")) |
++-------------------+
+| 128640 |
++-------------------+
+```
 
 **Related functions**:
 
 - [chr](#chr)
 
+### `bit_length`
+
+Returns the bit length of a string.
+
+```
+bit_length(str)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+
+#### Example
+
+```sql
+> select bit_length('datafusion');
++--------------------------------+
+| bit_length(Utf8("datafusion")) |
++--------------------------------+
+| 80 |
++--------------------------------+
+```
+
+**Related functions**:
+
+- [length](#length)
+- [octet_length](#octet_length)
+
+### `btrim`
+
+Trims the specified trim string from the start and end of a string. If no trim string is provided, all whitespace is removed from the start and end of the input string.
+
+```
+btrim(str[, trim_str])
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **trim_str**: String expression to trim from the start and end of the input string. Can be a constant, column, or function, and any combination of operators. _Default is whitespace characters._
+
+#### Example
+
+```sql
+> select btrim('__datafusion____', '_');
++-------------------------------------------+
+| btrim(Utf8("__datafusion____"),Utf8("_")) |
++-------------------------------------------+
+| datafusion |
++-------------------------------------------+
+```
+
+#### Aliases
+
+- trim
+
+**Related functions**:
+
+- [ltrim](#ltrim)
+- [rtrim](#rtrim)
+
+### `char_length`
+
+_Alias of [character_length](#character_length)._
+
+### `character_length`
+
+Returns the number of characters in a string.
+
+```
+character_length(str)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+
+#### Example
+
+```sql
+> select character_length('Ångström');
++------------------------------------+
+| character_length(Utf8("Ångström")) |
++------------------------------------+
+| 8 |
++------------------------------------+
+```
+
+#### Aliases
+
+- length
+- char_length
+
+**Related functions**:
+
+- [bit_length](#bit_length)
+- [octet_length](#octet_length)
+
+### `chr`
+
+Returns the character with the specified ASCII or Unicode code value.
+
+```
+chr(expression)
+```
+
+#### Arguments
+
+- **expression**: Expression containing the ASCII or Unicode code value to operate on. Can be a constant, column, or function, and any combination of operators.
+
+#### Example
+
+```sql
+> select chr(128640);
++--------------------+
+| chr(Int64(128640)) |
++--------------------+
+| 🚀 |
++--------------------+
+```
+
+**Related functions**:
+
+- [ascii](#ascii)
+
+### `concat`
+
+Concatenates multiple strings together.
+
+```
+concat(str[, ..., str_n])
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **str_n**: Subsequent string expressions to concatenate.
+
+#### Example
+
+```sql
+> select concat('data', 'f', 'us', 'ion');
++-------------------------------------------------------+
+| concat(Utf8("data"),Utf8("f"),Utf8("us"),Utf8("ion")) |
++-------------------------------------------------------+
+| datafusion |
++-------------------------------------------------------+
+```
+
+**Related functions**:
+
+- [concat_ws](#concat_ws)
+
+### `concat_ws`
+
+Concatenates multiple strings together with a specified separator.
+
+```
+concat_ws(separator, str[, ..., str_n])
+```
+
+#### Arguments
+
+- **separator**: Separator to insert between concatenated strings.
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **str_n**: Subsequent string expressions to concatenate.
+
+#### Example
+
+```sql
+> select concat_ws('_', 'data', 'fusion');
++--------------------------------------------------+
+| concat_ws(Utf8("_"),Utf8("data"),Utf8("fusion")) |
++--------------------------------------------------+
+| data_fusion |
++--------------------------------------------------+
+```
+
+**Related functions**:
+
+- [concat](#concat)
+
+### `contains`
+
+Returns true if search_str is found within str (case-sensitive).
+
+```
+contains(str, search_str)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **search_str**: The string to search for in str.
+
+#### Example
+
+```sql
+> select contains('the quick brown fox', 'row');
++---------------------------------------------------+
+| contains(Utf8("the quick brown fox"),Utf8("row")) |
++---------------------------------------------------+
+| true |
++---------------------------------------------------+
+```
+
+### `ends_with`
+
+Tests if a string ends with a substring.
+
+```
+ends_with(str, substr)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **substr**: Substring to test for.
+
+#### Example
+
+```sql
+> select ends_with('datafusion', 'soin');
++--------------------------------------------+
+| ends_with(Utf8("datafusion"),Utf8("soin")) |
++--------------------------------------------+
+| false |
++--------------------------------------------+
+> select ends_with('datafusion', 'sion');
++--------------------------------------------+
+| ends_with(Utf8("datafusion"),Utf8("sion")) |
++--------------------------------------------+
+| true |
++--------------------------------------------+
+```
+
+### `find_in_set`
+
+Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings.
+
+```
+find_in_set(str, strlist)
+```
+
+#### Arguments
+
+- **str**: String expression to find in strlist.
+- **strlist**: A string list is a string composed of substrings separated by , characters.
+
+#### Example
+
+```sql
+> select find_in_set('b', 'a,b,c,d');
++----------------------------------------+
+| find_in_set(Utf8("b"),Utf8("a,b,c,d")) |
++----------------------------------------+
+| 2 |
++----------------------------------------+
+```
+
+### `initcap`
+
+Capitalizes the first character in each word in the input string. Words are delimited by non-alphanumeric characters.
+
+```
+initcap(str)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on.
Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select initcap('apache datafusion'); ++------------------------------------+ +| initcap(Utf8("apache datafusion")) | ++------------------------------------+ +| Apache Datafusion | ++------------------------------------+ +``` + +**Related functions**: + +- [lower](#lower) +- [upper](#upper) + +### `instr` + +_Alias of [strpos](#strpos)._ + +### `left` + +Returns a specified number of characters from the left side of a string. + +``` +left(str, n) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **n**: Number of characters to return. + +#### Example + +```sql +> select left('datafusion', 4); ++-----------------------------------+ +| left(Utf8("datafusion"),Int64(4)) | ++-----------------------------------+ +| data | ++-----------------------------------+ +``` + +**Related functions**: + +- [right](#right) + +### `length` + +_Alias of [character_length](#character_length)._ + +### `levenshtein` + +Returns the [`Levenshtein distance`](https://en.wikipedia.org/wiki/Levenshtein_distance) between the two given strings. + +``` +levenshtein(str1, str2) +``` + +#### Arguments + +- **str1**: String expression to compute Levenshtein distance with str2. +- **str2**: String expression to compute Levenshtein distance with str1. + +#### Example + +```sql +> select levenshtein('kitten', 'sitting'); ++---------------------------------------------+ +| levenshtein(Utf8("kitten"),Utf8("sitting")) | ++---------------------------------------------+ +| 3 | ++---------------------------------------------+ +``` + +### `lower` + +Converts a string to lower-case. + +``` +lower(str) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select lower('Ångström'); ++-------------------------+ +| lower(Utf8("Ångström")) | ++-------------------------+ +| ångström | ++-------------------------+ +``` + +**Related functions**: + +- [initcap](#initcap) +- [upper](#upper) + +### `lpad` + +Pads the left side of a string with another string to a specified string length. + +``` +lpad(str, n[, padding_str]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **n**: String length to pad to. +- **padding_str**: Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._ + +#### Example + +```sql +> select lpad('Dolly', 10, 'hello'); ++---------------------------------------------+ +| lpad(Utf8("Dolly"),Int64(10),Utf8("hello")) | ++---------------------------------------------+ +| helloDolly | ++---------------------------------------------+ +``` + +**Related functions**: + +- [rpad](#rpad) + +### `ltrim` + +Trims the specified trim string from the beginning of a string. If no trim string is provided, all whitespace is removed from the start of the input string. + +``` +ltrim(str[, trim_str]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **trim_str**: String expression to trim from the beginning of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. 
_Default is whitespace characters._
+
+#### Example
+
+```sql
+> select ltrim(' datafusion ');
++-------------------------------+
+| ltrim(Utf8(" datafusion ")) |
++-------------------------------+
+| datafusion |
++-------------------------------+
+> select ltrim('___datafusion___', '_');
++-------------------------------------------+
+| ltrim(Utf8("___datafusion___"),Utf8("_")) |
++-------------------------------------------+
+| datafusion___ |
++-------------------------------------------+
+```
+
+**Related functions**:
+
+- [btrim](#btrim)
+- [rtrim](#rtrim)
+
+### `octet_length`
+
+Returns the length of a string in bytes.
+
+```
+octet_length(str)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+
+#### Example
+
+```sql
+> select octet_length('Ångström');
++--------------------------------+
+| octet_length(Utf8("Ångström")) |
++--------------------------------+
+| 10 |
++--------------------------------+
+```
+
+**Related functions**:
+
+- [bit_length](#bit_length)
+- [length](#length)
+
+### `position`
+
+_Alias of [strpos](#strpos)._
+
+### `repeat`
+
+Returns a string with an input string repeated a specified number of times.
+
+```
+repeat(str, n)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **n**: Number of times to repeat the input string.
+
+#### Example
+
+```sql
+> select repeat('data', 3);
++-------------------------------+
+| repeat(Utf8("data"),Int64(3)) |
++-------------------------------+
+| datadatadata |
++-------------------------------+
+```
+
+### `replace`
+
+Replaces all occurrences of a specified substring in a string with a new substring.
+
+```
+replace(str, substr, replacement)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **substr**: Substring expression to replace in the input string. Can be a constant, column, or function, and any combination of operators.
+- **replacement**: Replacement substring expression. Can be a constant, column, or function, and any combination of operators.
+
+#### Example
+
+```sql
+> select replace('ABabbaBA', 'ab', 'cd');
++-------------------------------------------------+
+| replace(Utf8("ABabbaBA"),Utf8("ab"),Utf8("cd")) |
++-------------------------------------------------+
+| ABcdbaBA |
++-------------------------------------------------+
+```
+
+### `reverse`
+
+Reverses the character order of a string.
+
+```
+reverse(str)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+
+#### Example
+
+```sql
+> select reverse('datafusion');
++-----------------------------+
+| reverse(Utf8("datafusion")) |
++-----------------------------+
+| noisufatad |
++-----------------------------+
+```
+
+### `right`
+
+Returns a specified number of characters from the right side of a string.
+
+```
+right(str, n)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **n**: Number of characters to return.
+
+#### Example
+
+```sql
+> select right('datafusion', 6);
++------------------------------------+
+| right(Utf8("datafusion"),Int64(6)) |
++------------------------------------+
+| fusion |
++------------------------------------+
+```
+
+**Related functions**:
+
+- [left](#left)
+
 ### `rpad`
 
 Pads the right side of a string with another string to a specified string length.
@@ -98,10 +724,307 @@ rpad(str, n[, padding_str])
 - **n**: String length to pad to.
 - **padding_str**: String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._
 
+#### Example
+
+```sql
+> select rpad('datafusion', 20, '_-');
++-----------------------------------------------+
+| rpad(Utf8("datafusion"),Int64(20),Utf8("_-")) |
++-----------------------------------------------+
+| datafusion_-_-_-_-_- |
++-----------------------------------------------+
+```
+
 **Related functions**:
 
 - [lpad](#lpad)
 
+### `rtrim`
+
+Trims the specified trim string from the end of a string. If no trim string is provided, all whitespace is removed from the end of the input string.
+
+```
+rtrim(str[, trim_str])
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **trim_str**: String expression to trim from the end of the input string. Can be a constant, column, or function, and any combination of arithmetic operators. _Default is whitespace characters._
+
+#### Example
+
+```sql
+> select rtrim(' datafusion ');
++-------------------------------+
+| rtrim(Utf8(" datafusion ")) |
++-------------------------------+
+| datafusion |
++-------------------------------+
+> select rtrim('___datafusion___', '_');
++-------------------------------------------+
+| rtrim(Utf8("___datafusion___"),Utf8("_")) |
++-------------------------------------------+
+| ___datafusion |
++-------------------------------------------+
+```
+
+**Related functions**:
+
+- [btrim](#btrim)
+- [ltrim](#ltrim)
+
+### `split_part`
+
+Splits a string based on a specified delimiter and returns the substring in the specified position.
+
+```
+split_part(str, delimiter, pos)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **delimiter**: String or character to split on.
+- **pos**: Position of the part to return.
+
+#### Example
+
+```sql
+> select split_part('1.2.3.4.5', '.', 3);
++--------------------------------------------------+
+| split_part(Utf8("1.2.3.4.5"),Utf8("."),Int64(3)) |
++--------------------------------------------------+
+| 3 |
++--------------------------------------------------+
+```
+
+### `starts_with`
+
+Tests if a string starts with a substring.
+
+```
+starts_with(str, substr)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+- **substr**: Substring to test for.
+
+#### Example
+
+```sql
+> select starts_with('datafusion','data');
++----------------------------------------------+
+| starts_with(Utf8("datafusion"),Utf8("data")) |
++----------------------------------------------+
+| true |
++----------------------------------------------+
+```
+
+### `strpos`
+
+Returns the starting position of a specified substring in a string. Positions begin at 1. If the substring does not exist in the string, the function returns 0.
+ +``` +strpos(str, substr) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **substr**: Substring expression to search for. + +#### Example + +```sql +> select strpos('datafusion', 'fus'); ++----------------------------------------+ +| strpos(Utf8("datafusion"),Utf8("fus")) | ++----------------------------------------+ +| 5 | ++----------------------------------------+ +``` + +#### Aliases + +- instr +- position + +### `substr` + +Extracts a substring of a specified number of characters from a specific starting position in a string. + +``` +substr(str, start_pos[, length]) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **start_pos**: Character position to start the substring at. The first character in the string has a position of 1. +- **length**: Number of characters to extract. If not specified, returns the rest of the string after the start position. + +#### Example + +```sql +> select substr('datafusion', 5, 3); ++----------------------------------------------+ +| substr(Utf8("datafusion"),Int64(5),Int64(3)) | ++----------------------------------------------+ +| fus | ++----------------------------------------------+ +``` + +#### Aliases + +- substring + +### `substr_index` + +Returns the substring from str before count occurrences of the delimiter delim. +If count is positive, everything to the left of the final delimiter (counting from the left) is returned. +If count is negative, everything to the right of the final delimiter (counting from the right) is returned. + +``` +substr_index(str, delim, count) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. +- **delim**: The string to find in str to split str. +- **count**: The number of times to search for the delimiter. Can be either a positive or negative number. + +#### Example + +```sql +> select substr_index('www.apache.org', '.', 1); ++---------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(1)) | ++---------------------------------------------------------+ +| www | ++---------------------------------------------------------+ +> select substr_index('www.apache.org', '.', -1); ++----------------------------------------------------------+ +| substr_index(Utf8("www.apache.org"),Utf8("."),Int64(-1)) | ++----------------------------------------------------------+ +| org | ++----------------------------------------------------------+ +``` + +#### Aliases + +- substring_index + +### `substring` + +_Alias of [substr](#substr)._ + +### `substring_index` + +_Alias of [substr_index](#substr_index)._ + +### `to_hex` + +Converts an integer to a hexadecimal string. + +``` +to_hex(int) +``` + +#### Arguments + +- **int**: Integer expression to operate on. Can be a constant, column, or function, and any combination of operators. + +#### Example + +```sql +> select to_hex(12345689); ++-------------------------+ +| to_hex(Int64(12345689)) | ++-------------------------+ +| bc6159 | ++-------------------------+ +``` + +### `translate` + +Translates characters in a string to specified translation characters. + +``` +translate(str, chars, translation) +``` + +#### Arguments + +- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. 
+- **chars**: Characters to translate.
+- **translation**: Translation characters. Translation characters replace only characters at the same position in the **chars** string.
+
+#### Example
+
+```sql
+> select translate('twice', 'wic', 'her');
++--------------------------------------------------+
+| translate(Utf8("twice"),Utf8("wic"),Utf8("her")) |
++--------------------------------------------------+
+| there |
++--------------------------------------------------+
+```
+
+### `trim`
+
+_Alias of [btrim](#btrim)._
+
+### `upper`
+
+Converts a string to upper-case.
+
+```
+upper(str)
+```
+
+#### Arguments
+
+- **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators.
+
+#### Example
+
+```sql
+> select upper('dataFusion');
++---------------------------+
+| upper(Utf8("dataFusion")) |
++---------------------------+
+| DATAFUSION |
++---------------------------+
+```
+
+**Related functions**:
+
+- [initcap](#initcap)
+- [lower](#lower)
+
+### `uuid`
+
+Returns a [`UUID v4`](https://en.wikipedia.org/wiki/Universally_unique_identifier#Version_4_(random)) string value which is unique per row.
+
+```
+uuid()
+```
+
+#### Example
+
+```sql
+> select uuid();
++--------------------------------------+
+| uuid() |
++--------------------------------------+
+| 6ec17ef8-1934-41cc-8d59-d0c8f9eea1f0 |
++--------------------------------------+
+```
+
 ## Binary String Functions
 
 - [decode](#decode)

From 577e4bba0f5838846862621e1f5318c949cff2cb Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Mon, 7 Oct 2024 13:29:15 -0400
Subject: [PATCH 22/22] Account for constant equivalence properties in union,
 tests (#12562)

---
 .../physical-expr-common/src/sort_expr.rs     |   7 +
 .../physical-expr/src/equivalence/class.rs    |  50 +-
 .../src/equivalence/properties.rs             | 433 ++++++++++++++----
 3 files changed, 409 insertions(+), 81 deletions(-)

diff --git a/datafusion/physical-expr-common/src/sort_expr.rs b/datafusion/physical-expr-common/src/sort_expr.rs
index 704cb291335f..6c4bf156ce56 100644
--- a/datafusion/physical-expr-common/src/sort_expr.rs
+++ b/datafusion/physical-expr-common/src/sort_expr.rs
@@ -120,6 +120,13 @@ impl PhysicalSortExpr {
     }
 }
 
+/// Access the PhysicalSortExpr as a PhysicalExpr
+impl AsRef<dyn PhysicalExpr> for PhysicalSortExpr {
+    fn as_ref(&self) -> &(dyn PhysicalExpr + 'static) {
+        self.expr.as_ref()
+    }
+}
+
 impl PartialEq for PhysicalSortExpr {
     fn eq(&self, other: &PhysicalSortExpr) -> bool {
         self.options == other.options && self.expr.eq(&other.expr)
diff --git a/datafusion/physical-expr/src/equivalence/class.rs b/datafusion/physical-expr/src/equivalence/class.rs
index 00708b4540aa..c1851ddb22b5 100644
--- a/datafusion/physical-expr/src/equivalence/class.rs
+++ b/datafusion/physical-expr/src/equivalence/class.rs
@@ -30,7 +30,6 @@ use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
 use datafusion_common::JoinType;
 use datafusion_physical_expr_common::physical_expr::format_physical_expr_list;
 
-#[derive(Debug, Clone)]
 /// A structure representing a expression known to be constant in a physical execution plan.
 ///
 /// The `ConstExpr` struct encapsulates an expression that is constant during the execution
 /// of a query. For example if a predicate like `A = 5` applied earlier in the plan `A` would
 /// be known constant
 ///
 /// # Fields
 ///
 /// - `expr`: Constant expression for a node in the physical plan.
 ///
-/// - `across_partitions`: A boolean flag indicating whether the constant expression is
-/// valid across partitions. If set to `true`, the constant expression has same value for all partitions.
-/// If set to `false`, the constant expression may have different values for different partitions.
+/// - `across_partitions`: A boolean flag indicating whether the constant
+/// expression is the same across partitions. If set to `true`, the constant
+/// expression has the same value for all partitions. If set to `false`, the
+/// constant expression may have different values for different partitions.
 ///
 /// # Example
 ///
 /// ```
 /// # use datafusion_physical_expr::ConstExpr;
 /// # use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 /// # use std::sync::Arc;
 /// // create a constant expression from a physical expression
 /// let const_expr = ConstExpr::from(col);
 /// ```
+#[derive(Debug, Clone)]
 pub struct ConstExpr {
+    /// The expression that is known to be constant (e.g. a `Column`)
     expr: Arc<dyn PhysicalExpr>,
+    /// Does the constant have the same value across all partitions? See
+    /// struct docs for more details
     across_partitions: bool,
 }
 
+impl PartialEq for ConstExpr {
+    fn eq(&self, other: &Self) -> bool {
+        self.across_partitions == other.across_partitions
+            && self.expr.eq(other.expr.as_any())
+    }
+}
+
 impl ConstExpr {
     /// Create a new constant expression from a physical expression.
     ///
@@ -74,11 +85,17 @@ impl ConstExpr {
         }
     }
 
+    /// Set the `across_partitions` flag
+    ///
+    /// See struct docs for more details
     pub fn with_across_partitions(mut self, across_partitions: bool) -> Self {
         self.across_partitions = across_partitions;
         self
     }
 
+    /// Is the expression the same across all partitions?
+    ///
+    /// See struct docs for more details
     pub fn across_partitions(&self) -> bool {
         self.across_partitions
     }
@@ -101,6 +118,31 @@ impl ConstExpr {
             across_partitions: self.across_partitions,
         })
     }
+
+    /// Returns true if this constant expression is equal to the given expression
+    pub fn eq_expr(&self, other: impl AsRef<dyn PhysicalExpr>) -> bool {
+        self.expr.eq(other.as_ref().as_any())
+    }
+
+    /// Returns a [`Display`]able list of `ConstExpr`.
+    pub fn format_list(input: &[ConstExpr]) -> impl Display + '_ {
+        struct DisplayableList<'a>(&'a [ConstExpr]);
+        impl<'a> Display for DisplayableList<'a> {
+            fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+                let mut first = true;
+                for const_expr in self.0 {
+                    if first {
+                        first = false;
+                    } else {
+                        write!(f, ",")?;
+                    }
+                    write!(f, "{}", const_expr)?;
+                }
+                Ok(())
+            }
+        }
+        DisplayableList(input)
+    }
 }
 
 /// Display implementation for `ConstExpr`
diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs
index 1c12f4697005..4ab0cac58257 100644
--- a/datafusion/physical-expr/src/equivalence/properties.rs
+++ b/datafusion/physical-expr/src/equivalence/properties.rs
@@ -17,6 +17,8 @@
 use std::fmt::Display;
 use std::hash::{Hash, Hasher};
+use std::iter::Peekable;
+use std::slice::Iter;
 use std::sync::Arc;
 
 use super::ordering::collapse_lex_ordering;
@@ -279,6 +281,12 @@ impl EquivalenceProperties {
         self.with_constants(constants)
     }
 
+    /// Remove the specified constant
+    pub fn remove_constant(mut self, c: &ConstExpr) -> Self {
+        self.constants.retain(|existing| existing != c);
+        self
+    }
+
     /// Track/register physical expressions with constant values.
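+    ///
+    /// Constants are expressions known to have the same value for every row;
+    /// for example, a column `A` is constant after a filter like `A = 5`
+    /// (see [`ConstExpr`]).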
     pub fn with_constants(
         mut self,
@@ -1120,15 +1128,7 @@ impl Display for EquivalenceProperties {
             write!(f, ", eq: {}", self.eq_group)?;
         }
         if !self.constants.is_empty() {
-            write!(f, ", const: [")?;
-            let mut iter = self.constants.iter();
-            if let Some(c) = iter.next() {
-                write!(f, "{}", c)?;
-            }
-            for c in iter {
-                write!(f, ", {}", c)?;
-            }
-            write!(f, "]")?;
+            write!(f, ", const: [{}]", ConstExpr::format_list(&self.constants))?;
         }
         Ok(())
     }
@@ -1646,58 +1646,62 @@ impl Hash for ExprWrapper {
 
 /// Calculates the union (in the sense of `UnionExec`) `EquivalenceProperties`
 /// of `lhs` and `rhs` according to the schema of `lhs`.
+///
+/// Rules: The UnionExec does not interleave its inputs: instead it passes each
+/// input partition from the children as its own output.
+///
+/// Since the output equivalence properties are properties that are true for
+/// *all* output partitions, that is the same as being true for all *input*
+/// partitions
 fn calculate_union_binary(
-    lhs: EquivalenceProperties,
+    mut lhs: EquivalenceProperties,
     mut rhs: EquivalenceProperties,
 ) -> Result<EquivalenceProperties> {
-    // TODO: In some cases, we should be able to preserve some equivalence
-    // classes. Add support for such cases.
-
     // Harmonize the schema of the rhs with the schema of the lhs (which is the accumulator schema):
     if !rhs.schema.eq(&lhs.schema) {
         rhs = rhs.with_new_schema(Arc::clone(&lhs.schema))?;
     }
 
-    // First, calculate valid constants for the union. A quantity is constant
-    // after the union if it is constant in both sides.
-    let constants = lhs
+    // First, calculate valid constants for the union. An expression is constant
+    // at the output of the union if it is constant in both sides.
+    let constants: Vec<_> = lhs
        .constants()
        .iter()
        .filter(|const_expr| const_exprs_contains(rhs.constants(), const_expr.expr()))
        .map(|const_expr| {
-            // TODO: When both sides' constants are valid across partitions,
-            // the union's constant should also be valid if values are
-            // the same. However, we do not have the capability to
-            // check this yet.
+            // TODO: When both sides have a constant column, and the actual
+            // constant value is the same, then the output properties could
+            // reflect that the constant is valid across all partitions. However we
+            // don't track the actual value that the ConstExpr takes on, so we
+            // can't determine that yet
            ConstExpr::new(Arc::clone(const_expr.expr())).with_across_partitions(false)
        })
        .collect();
 
+    // remove any constants that are shared in both outputs (avoid double counting them)
+    for c in &constants {
+        lhs = lhs.remove_constant(c);
+        rhs = rhs.remove_constant(c);
+    }
+
     // Next, calculate valid orderings for the union by searching for prefixes
     // in both sides.
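+    //
+    // Note that an ordering that holds on only one side can still be valid
+    // for the union if the missing expressions are constants on the other
+    // side; the ordering builder below handles that augmentation.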
-    let mut orderings = vec![];
-    for mut ordering in lhs.normalized_oeq_class().orderings {
-        // Progressively shorten the ordering to search for a satisfied prefix:
-        while !rhs.ordering_satisfy(&ordering) {
-            ordering.pop();
-        }
-        // There is a non-trivial satisfied prefix, add it as a valid ordering:
-        if !ordering.is_empty() {
-            orderings.push(ordering);
-        }
-    }
-    for mut ordering in rhs.normalized_oeq_class().orderings {
-        // Progressively shorten the ordering to search for a satisfied prefix:
-        while !lhs.ordering_satisfy(&ordering) {
-            ordering.pop();
-        }
-        // There is a non-trivial satisfied prefix, add it as a valid ordering:
-        if !ordering.is_empty() {
-            orderings.push(ordering);
-        }
-    }
-    let mut eq_properties = EquivalenceProperties::new(lhs.schema);
-    eq_properties.constants = constants;
+    let mut orderings = UnionEquivalentOrderingBuilder::new();
+    orderings.add_satisfied_orderings(
+        lhs.normalized_oeq_class().orderings,
+        lhs.constants(),
+        &rhs,
+    );
+    orderings.add_satisfied_orderings(
+        rhs.normalized_oeq_class().orderings,
+        rhs.constants(),
+        &lhs,
+    );
+    let orderings = orderings.build();
+
+    let mut eq_properties =
+        EquivalenceProperties::new(lhs.schema).with_constants(constants);
+
     eq_properties.add_new_orderings(orderings);
     Ok(eq_properties)
 }
@@ -1730,6 +1734,206 @@ pub fn calculate_union(
     Ok(acc)
 }
 
+#[derive(Debug)]
+enum AddedOrdering {
+    /// The ordering was added to the in progress result
+    Yes,
+    /// The ordering was not added
+    No(LexOrdering),
+}
+
+/// Builds valid output orderings of a `UnionExec`
+#[derive(Debug)]
+struct UnionEquivalentOrderingBuilder {
+    orderings: Vec<LexOrdering>,
+}
+
+impl UnionEquivalentOrderingBuilder {
+    fn new() -> Self {
+        Self { orderings: vec![] }
+    }
+
+    /// Add all orderings from `orderings` that satisfy `properties`,
+    /// potentially augmented with `constants`.
+    ///
+    /// Note: any column that is known to be constant can be inserted into the
+    /// ordering without changing its meaning
+    ///
+    /// For example:
+    /// * `orderings` contains `[a ASC, c ASC]` and `constants` contains `b`
+    /// * `properties` has required ordering `[a ASC, b ASC]`
+    ///
+    /// Then this will add `[a ASC, b ASC]` to the `orderings` list (as `a` was
+    /// in the sort order and `b` was a constant).
+    fn add_satisfied_orderings(
+        &mut self,
+        orderings: impl IntoIterator<Item = LexOrdering>,
+        constants: &[ConstExpr],
+        properties: &EquivalenceProperties,
+    ) {
+        for mut ordering in orderings.into_iter() {
+            // Progressively shorten the ordering to search for a satisfied prefix:
+            loop {
+                match self.try_add_ordering(ordering, constants, properties) {
+                    AddedOrdering::Yes => break,
+                    AddedOrdering::No(o) => {
+                        ordering = o;
+                        ordering.pop();
+                    }
+                }
+            }
+        }
+    }
+
+    /// Adds `ordering`, potentially augmented with constants, if it satisfies
+    /// the target `properties`.
+    ///
+    /// Returns
+    ///
+    /// * [`AddedOrdering::Yes`] if the ordering was added (either directly or
+    ///   augmented), or was empty.
+    ///
+    /// * [`AddedOrdering::No`] if the ordering was not added
+    fn try_add_ordering(
+        &mut self,
+        ordering: LexOrdering,
+        constants: &[ConstExpr],
+        properties: &EquivalenceProperties,
+    ) -> AddedOrdering {
+        if ordering.is_empty() {
+            AddedOrdering::Yes
+        } else if constants.is_empty() && properties.ordering_satisfy(&ordering) {
+            // If the ordering satisfies the target properties, no need to
+            // augment it with constants.
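+            // (For example, if this ordering is [a ASC, b ASC] and the
+            // target properties already guarantee [a ASC, b ASC], it is
+            // added unchanged.)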
+            self.orderings.push(ordering);
+            AddedOrdering::Yes
+        } else {
+            // Did not satisfy target properties, try and augment with constants
+            // to match the properties
+            if self.try_find_augmented_ordering(&ordering, constants, properties) {
+                AddedOrdering::Yes
+            } else {
+                AddedOrdering::No(ordering)
+            }
+        }
+    }
+
+    /// Attempts to add `constants` to `ordering` to satisfy the properties.
+    ///
+    /// Returns true if any orderings were added, false otherwise
+    fn try_find_augmented_ordering(
+        &mut self,
+        ordering: &LexOrdering,
+        constants: &[ConstExpr],
+        properties: &EquivalenceProperties,
+    ) -> bool {
+        // can't augment if there is nothing to augment with
+        if constants.is_empty() {
+            return false;
+        }
+        let start_num_orderings = self.orderings.len();
+
+        // for each equivalent ordering in properties, try to augment
+        // `ordering` with the constants to match it
+        for existing_ordering in &properties.oeq_class.orderings {
+            if let Some(augmented_ordering) = self.augment_ordering(
+                ordering,
+                constants,
+                existing_ordering,
+                &properties.constants,
+            ) {
+                if !augmented_ordering.is_empty() {
+                    assert!(properties.ordering_satisfy(&augmented_ordering));
+                    self.orderings.push(augmented_ordering);
+                }
+            }
+        }
+
+        self.orderings.len() > start_num_orderings
+    }
+
+    /// Attempts to augment the ordering with constants to match the
+    /// `existing_ordering`
+    ///
+    /// Returns Some(ordering) if an augmented ordering was found, None otherwise
+    fn augment_ordering(
+        &mut self,
+        ordering: &LexOrdering,
+        constants: &[ConstExpr],
+        existing_ordering: &LexOrdering,
+        existing_constants: &[ConstExpr],
+    ) -> Option<LexOrdering> {
+        let mut augmented_ordering = vec![];
+        let mut sort_expr_iter = ordering.iter().peekable();
+        let mut existing_sort_expr_iter = existing_ordering.iter().peekable();
+
+        // walk in parallel down the two orderings, trying to match them up
+        while sort_expr_iter.peek().is_some() || existing_sort_expr_iter.peek().is_some()
+        {
+            // If the next expressions are equal, add the next match
+            // otherwise try and match with a constant
+            if let Some(expr) =
+                advance_if_match(&mut sort_expr_iter, &mut existing_sort_expr_iter)
+            {
+                augmented_ordering.push(expr);
+            } else if let Some(expr) =
+                advance_if_matches_constant(&mut sort_expr_iter, existing_constants)
+            {
+                augmented_ordering.push(expr);
+            } else if let Some(expr) =
+                advance_if_matches_constant(&mut existing_sort_expr_iter, constants)
+            {
+                augmented_ordering.push(expr);
+            } else {
+                // no match, can't continue the ordering, return what we have
+                break;
+            }
+        }
+
+        Some(augmented_ordering)
+    }
+
+    fn build(self) -> Vec<LexOrdering> {
+        self.orderings
+    }
+}
+
+/// Advances two iterators in parallel
+///
+/// If the next expressions are equal, both iterators are advanced and the
+/// matched expression is returned.
+///
+/// Otherwise, the iterators are left unchanged and `None` is returned
+fn advance_if_match(
+    iter1: &mut Peekable<Iter<PhysicalSortExpr>>,
+    iter2: &mut Peekable<Iter<PhysicalSortExpr>>,
+) -> Option<PhysicalSortExpr> {
+    if matches!((iter1.peek(), iter2.peek()), (Some(expr1), Some(expr2)) if expr1.eq(expr2))
+    {
+        iter1.next().unwrap();
+        iter2.next().cloned()
+    } else {
+        None
+    }
+}
+
+/// Advances the iterator with a constant
+///
+/// If the next expression matches one of the constants, advances the iterator
+/// and returns the matched expression
+///
+/// Otherwise, the iterator is left unchanged and `None` is returned
+fn advance_if_matches_constant(
+    iter: &mut Peekable<Iter<PhysicalSortExpr>>,
+    constants: &[ConstExpr],
+) -> Option<PhysicalSortExpr> {
+    let expr = iter.peek()?;
+    let const_expr = constants.iter().find(|c| c.eq_expr(expr))?;
+    let found_expr = PhysicalSortExpr::new(Arc::clone(const_expr.expr()), expr.options);
+    iter.next();
+    Some(found_expr)
+}
+
 #[cfg(test)]
 mod tests {
     use std::ops::Not;
@@ -2874,7 +3078,7 @@ mod tests {
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_1() {
+    fn test_union_equivalence_properties_constants_common_constants() {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
@@ -2898,10 +3102,9 @@ mod tests {
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_2() {
+    fn test_union_equivalence_properties_constants_prefix() {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
-            // Meet ordering between [a ASC], [a ASC, b ASC] should be [a ASC]
             .with_child_sort_and_const_exprs(
                 // First child: [a ASC], const []
                 vec![vec!["a"]],
                 vec![],
                 &schema,
             )
             .with_child_sort_and_const_exprs(
                 // Second child: [a ASC, b ASC], const []
                 vec![vec!["a", "b"]],
                 vec![],
                 &schema,
             )
             .with_expected_sort_and_const_exprs(
                 // Union orderings: [a ASC], const []
                 vec![vec!["a"]],
                 vec![],
             )
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_3() {
+    fn test_union_equivalence_properties_constants_asc_desc_mismatch() {
         let schema = create_test_schema().unwrap();
         UnionEquivalenceTest::new(&schema)
-            // Meet ordering between [a ASC], [a DESC] should be []
             .with_child_sort_and_const_exprs(
                 // First child: [a ASC], const []
                 vec![vec!["a"]],
                 vec![],
                 &schema,
             )
             .with_child_sort_and_const_exprs(
                 // Second child: [a DESC], const []
                 vec![vec!["a DESC"]],
                 vec![],
                 &schema,
             )
             .with_expected_sort_and_const_exprs(
                 // Union doesn't have any ordering or constant
                 vec![],
                 vec![],
             )
             .run()
     }
 
     #[test]
-    fn test_union_equivalence_properties_constants_4() {
+    fn test_union_equivalence_properties_constants_different_schemas() {
         let schema = create_test_schema().unwrap();
         let schema2 = append_fields(&schema, "1");
         UnionEquivalenceTest::new(&schema)
             .with_child_sort_and_const_exprs(
                 // First child orderings: [a ASC], const []
                 vec![vec!["a"]],
                 vec![],
                 &schema,
             )
             .with_child_sort_and_const_exprs(
                 // Second child orderings: [a1 ASC], const []
                 vec![vec!["a1"]],
                 vec![],
                 &schema2,
             )
             .with_expected_sort_and_const_exprs(
-                // Union orderings:
-                // should be [a ASC]
+                // Union orderings: [a ASC]
                 //
-                // Where a, and a1 ath the same index for their corresponding
-                // schemas.
+                // Note that a and a1 are at the same index for their
+                // corresponding schemas.
vec![vec!["a"]], vec![], ) @@ -2977,9 +3178,7 @@ mod tests { } #[test] - #[ignore] - // ignored due to https://github.com/apache/datafusion/issues/12446 - fn test_union_equivalence_properties_constants() { + fn test_union_equivalence_properties_constants_fill_gaps() { let schema = create_test_schema().unwrap(); UnionEquivalenceTest::new(&schema) .with_child_sort_and_const_exprs( @@ -3006,13 +3205,58 @@ mod tests { } #[test] - #[ignore] - // ignored due to https://github.com/apache/datafusion/issues/12446 - fn test_union_equivalence_properties_constants_desc() { + fn test_union_equivalence_properties_constants_no_fill_gaps() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child orderings: [a ASC, c ASC], const [d] // some other constant + vec![vec!["a", "c"]], + vec!["d"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child orderings: [b ASC, c ASC], const [a] + vec![vec!["b", "c"]], + vec!["a"], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: [[a]] (only a is constant) + vec![vec!["a"]], + vec![], + ) + .run() + } + + #[test] + fn test_union_equivalence_properties_constants_fill_some_gaps() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child orderings: [c ASC], const [a, b] // some other constant + vec![vec!["c"]], + vec!["a", "b"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child orderings: [a DESC, b], const [] + vec![vec!["a DESC", "b"]], + vec![], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: [[a, b]] (can fill in the a/b with constants) + vec![vec!["a DESC", "b"]], + vec![], + ) + .run() + } + + #[test] + fn test_union_equivalence_properties_constants_fill_gaps_non_symmetric() { let schema = create_test_schema().unwrap(); UnionEquivalenceTest::new(&schema) .with_child_sort_and_const_exprs( - // NB `b DESC` in the second child // First child orderings: [a ASC, c ASC], const [b] vec![vec!["a", "c"]], vec!["b"], @@ -3036,9 +3280,7 @@ mod tests { } #[test] - #[ignore] - // ignored due to https://github.com/apache/datafusion/issues/12446 - fn test_union_equivalence_properties_constants_middle() { + fn test_union_equivalence_properties_constants_gap_fill_symmetric() { let schema = create_test_schema().unwrap(); UnionEquivalenceTest::new(&schema) .with_child_sort_and_const_exprs( @@ -3055,8 +3297,8 @@ mod tests { ) .with_expected_sort_and_const_exprs( // Union orderings: - // [a, b, d] (c constant) - // [a, c, d] (b constant) + // [a, b, c, d] + // [a, c, b, d] vec![vec!["a", "c", "b", "d"], vec!["a", "b", "c", "d"]], vec![], ) @@ -3064,8 +3306,31 @@ mod tests { } #[test] - #[ignore] - // ignored due to https://github.com/apache/datafusion/issues/12446 + fn test_union_equivalence_properties_constants_gap_fill_and_common() { + let schema = create_test_schema().unwrap(); + UnionEquivalenceTest::new(&schema) + .with_child_sort_and_const_exprs( + // First child: [a DESC, d ASC], const [b, c] + vec![vec!["a DESC", "d"]], + vec!["b", "c"], + &schema, + ) + .with_child_sort_and_const_exprs( + // Second child: [a DESC, c ASC, d ASC], const [b] + vec![vec!["a DESC", "c", "d"]], + vec!["b"], + &schema, + ) + .with_expected_sort_and_const_exprs( + // Union orderings: + // [a DESC, c, d] [b] + vec![vec!["a DESC", "c", "d"]], + vec!["b"], + ) + .run() + } + + #[test] fn test_union_equivalence_properties_constants_middle_desc() { let schema = 
create_test_schema().unwrap(); UnionEquivalenceTest::new(&schema) @@ -3176,18 +3441,32 @@ mod tests { child_properties, expected_properties, } = self; + let expected_properties = expected_properties.expect("expected_properties not set"); - let actual_properties = - calculate_union(child_properties, Arc::clone(&output_schema)) - .expect("failed to calculate union equivalence properties"); - assert_eq_properties_same( - &actual_properties, - &expected_properties, - format!( - "expected: {expected_properties:?}\nactual: {actual_properties:?}" - ), - ); + + // try all permutations of the children + // as the code treats lhs and rhs differently + for child_properties in child_properties + .iter() + .cloned() + .permutations(child_properties.len()) + { + println!("--- permutation ---"); + for c in &child_properties { + println!("{c}"); + } + let actual_properties = + calculate_union(child_properties, Arc::clone(&output_schema)) + .expect("failed to calculate union equivalence properties"); + assert_eq_properties_same( + &actual_properties, + &expected_properties, + format!( + "expected: {expected_properties:?}\nactual: {actual_properties:?}" + ), + ); + } } /// Make equivalence properties for the specified columns named in orderings and constants