diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 6ca0fcf6b333..3439af06ae4b 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -70,7 +70,7 @@ body: attributes: label: "OS:" placeholder: e.g., iOS - description: Please include operating system version and architecture (e.g., aarch64, x86, x64, etc) + description: Please include operating system version and architecture (e.g., aarch64, x86_64, etc.). validations: required: true - type: input @@ -85,7 +85,12 @@ body: placeholder: e.g., Python validations: required: true - + - type: input + attributes: + label: "Hardware:" + placeholder: If your issue is performance-related, please include information on your CPU and memory. + validations: + required: false - type: markdown attributes: value: "# Identity Disclosure:" diff --git a/.github/config/out_of_tree_extensions.cmake b/.github/config/out_of_tree_extensions.cmake index edd9ad14ffb4..c17150e1d45a 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -93,7 +93,7 @@ if (NOT MINGW) duckdb_extension_load(postgres_scanner DONT_LINK GIT_URL https://github.com/duckdb/postgres_scanner - GIT_TAG d0e0115f29a9dbe44f026aea7290a1c6d4622a73 + GIT_TAG 58dc3d0c7620f3978c27d4a490563fd65884d103 ) endif() @@ -101,7 +101,7 @@ endif() duckdb_extension_load(spatial DONT_LINK LOAD_TESTS GIT_URL https://github.com/duckdb/duckdb_spatial.git - GIT_TAG 0bc9dff490cff60526061e7c72dceed945d3b3a3 + GIT_TAG 58e0fcd09f2306803da36c4b1e8a66bb1e263316 INCLUDE_DIR spatial/include TEST_DIR test/sql ) @@ -117,7 +117,7 @@ endif() duckdb_extension_load(sqlite_scanner ${STATIC_LINK_SQLITE} LOAD_TESTS GIT_URL https://github.com/duckdb/sqlite_scanner - GIT_TAG 647f1403791890b65a1419841df02bf17d634639 + GIT_TAG 315861963c8106397af36cbda10faebc8dae485a ) duckdb_extension_load(sqlsmith @@ -131,7 +131,7 @@ if (NOT WIN32) duckdb_extension_load(substrait LOAD_TESTS DONT_LINK GIT_URL https://github.com/duckdb/substrait - GIT_TAG 55922a3e77756054abbe3e04dae17ccf4203ad6f + GIT_TAG 800be4945807b831754f6b0d1a064a3d30f9cada ) endif() @@ -141,7 +141,7 @@ duckdb_extension_load(vss LOAD_TESTS DONT_LINK GIT_URL https://github.com/duckdb/duckdb_vss - GIT_TAG 3e192f25de97bdd759f96eeb488c59750db73937 + GIT_TAG 77739ea5382cce3220af83803ac0b1e98b3ab7d8 TEST_DIR test/sql ) @@ -151,6 +151,6 @@ if (NOT MINGW) DONT_LINK LOAD_TESTS GIT_URL https://github.com/duckdb/duckdb_mysql - GIT_TAG 64cb6aec994fbe441157086599c265eb86303c84 + GIT_TAG d0c56abf1169cca1c54f55448b68a85ae4279ea4 ) endif() diff --git a/.github/workflows/LinuxRelease.yml b/.github/workflows/LinuxRelease.yml index 354a452e8e14..708255fcfa67 100644 --- a/.github/workflows/LinuxRelease.yml +++ b/.github/workflows/LinuxRelease.yml @@ -68,6 +68,7 @@ jobs: steps: - uses: actions/checkout@v3 with: + fetch-depth: 0 ref: ${{ inputs.git_ref }} - uses: ./.github/actions/manylinux_2014_setup diff --git a/.github/workflows/NightlyTests.yml b/.github/workflows/NightlyTests.yml index b35181111add..24eb8372a241 100644 --- a/.github/workflows/NightlyTests.yml +++ b/.github/workflows/NightlyTests.yml @@ -470,6 +470,7 @@ jobs: DISABLE_STRING_INLINE: 1 DESTROY_UNPINNED_BLOCKS: 1 ALTERNATIVE_VERIFY: 1 + LSAN_OPTIONS: suppressions=${{ github.workspace }}/.sanitizer-leak-suppressions.txt steps: - uses: actions/checkout@v3 @@ -530,7 +531,8 @@ jobs: - name: Test shell: bash - run: build/reldebug/test/unittest "*" + run: | + python3 scripts/run_tests_one_by_one.py 
build/reldebug/test/unittest "*" --no-exit --time_execution vector-verification: name: Vector Verification Tests (${{ matrix.vector_type }}) @@ -757,6 +759,7 @@ jobs: python3 scripts/run_tests_one_by_one.py build/reldebug/test/unittest "[detailed_profiler]" --no-exit --timeout 600 python3 scripts/run_tests_one_by_one.py build/reldebug/test/unittest test/sql/tpch/tpch_sf01.test_slow --no-exit --timeout 600 + vector-sizes: name: Vector Sizes runs-on: ubuntu-20.04 @@ -764,25 +767,32 @@ jobs: env: CC: gcc-10 CXX: g++-10 + GEN: ninja + STANDARD_VECTOR_SIZE: 2 steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 + - uses: actions/checkout@v3 + with: + fetch-depth: 0 - - uses: actions/setup-python@v5 - with: - python-version: '3.12' + - name: Install + shell: bash + run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build - - name: Setup Ccache - uses: hendrikmuhs/ccache-action@main - with: - key: ${{ github.job }} - save: ${{ env.CCACHE_SAVE }} + - name: Setup Ccache + uses: hendrikmuhs/ccache-action@main + with: + key: ${{ github.job }} + save: ${{ env.CCACHE_SAVE }} - - name: Test - shell: bash - run: python scripts/test_vector_sizes.py + - name: Build + shell: bash + run: make relassert + + - name: Test + shell: bash + run: | + python3 scripts/run_tests_one_by_one.py build/relassert/test/unittest --no-exit --time_execution block-sizes: name: Block Sizes @@ -791,15 +801,19 @@ jobs: env: CC: gcc-10 CXX: g++-10 + GEN: ninja + BLOCK_ALLOC_SIZE: 16384 + BUILD_JSON: 1 + BUILD_PARQUET: 1 steps: - uses: actions/checkout@v3 with: fetch-depth: 0 - - uses: actions/setup-python@v5 - with: - python-version: '3.12' + - name: Install + shell: bash + run: sudo apt-get update -y -qq && sudo apt-get install -y -qq ninja-build - name: Setup Ccache uses: hendrikmuhs/ccache-action@main @@ -807,9 +821,25 @@ jobs: key: ${{ github.job }} save: ${{ env.CCACHE_SAVE }} - - name: Test + - name: Build with standard vector size + shell: bash + run: make relassert + + - name: Fast and storage tests shell: bash - run: python scripts/test_block_sizes.py + run: | + python3 scripts/run_tests_one_by_one.py ./build/relassert/test/unittest --no-exit --time_execution + python3 scripts/run_tests_one_by_one.py ./build/relassert/test/unittest "test/sql/storage/*" --no-exit --time_execution + + - name: Build with vector size of 512 + shell: bash + run: rm -rf ./build && rm -rf ./duckdb_unittest_tempdir && make clean && STANDARD_VECTOR_SIZE=512 make relassert + + - name: Fast and storage tests + shell: bash + run: | + python3 scripts/run_tests_one_by_one.py ./build/relassert/test/unittest --no-exit --time_execution + python3 scripts/run_tests_one_by_one.py ./build/relassert/test/unittest "test/sql/storage/*" --no-exit --time_execution linux-wasm-experimental: name: WebAssembly duckdb-wasm builds diff --git a/.sanitizer-leak-suppressions.txt b/.sanitizer-leak-suppressions.txt new file mode 100644 index 000000000000..599ad3fccf69 --- /dev/null +++ b/.sanitizer-leak-suppressions.txt @@ -0,0 +1,5 @@ +# dsdgen extension global statics +leak:load_dist +leak:find_dist +leak:makePermutation +leak:init_params diff --git a/.sanitizer-thread-suppressions.txt b/.sanitizer-thread-suppressions.txt index 3f5a74142bff..f8952cbd6109 100644 --- a/.sanitizer-thread-suppressions.txt +++ b/.sanitizer-thread-suppressions.txt @@ -3,6 +3,6 @@ race:InsertMatchesAndIncrementMisses race:NextInnerJoin race:NextRightSemiOrAntiJoin race:duckdb_moodycamel -race:duckdb_jemalloc +race:*duckdb/extension/jemalloc/jemalloc/* 
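# (Context, not part of either suppression list: the sanitizer runtimes match
# each "leak:"/"race:" pattern, "*" globs included, against the function
# names, source file paths, and module names appearing in a report, so the
# path glob above suppresses jemalloc frames even after its symbols are
# renamed by private_namespace.h. LeakSanitizer only reads such a file when
# told to, e.g. LSAN_OPTIONS=suppressions=<path> as exported in
# NightlyTests.yml above.)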
race:AddToEvictionQueue -race:ValidityAppend \ No newline at end of file +race:ValidityAppend diff --git a/CMakeLists.txt b/CMakeLists.txt index 683853365003..7dc2bb77e06d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -215,6 +215,10 @@ if(${EXPLICIT_EXCEPTIONS}) set(CXX_EXTRA "${CXX_EXTRA} -fexceptions") endif() +if (UNSAFE_NUMERIC_CAST) + message(STATUS "UNSAFE_NUMERIC_CAST") + add_definitions(-DUNSAFE_NUMERIC_CAST=1) +endif() if (ENABLE_EXTENSION_AUTOLOADING) add_definitions(-DDUCKDB_EXTENSION_AUTOLOAD_DEFAULT=1) endif() @@ -1375,7 +1379,8 @@ if(NOT DUCKDB_EXPLICIT_PLATFORM) ${PROJECT_BINARY_DIR}) add_custom_target( duckdb_platform ALL - COMMAND duckdb_platform_binary > duckdb_platform_out || ( echo "Provide explicit DUCKDB_PLATFORM=your_target_arch to avoid build-type detection of the platform" && exit 1 ) + COMMAND duckdb_platform_binary > ${PROJECT_BINARY_DIR}/duckdb_platform_out || ( echo "Provide explicit DUCKDB_PLATFORM=your_target_arch to avoid build-type detection of the platform" && exit 1 ) + WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} ) add_dependencies(duckdb_platform duckdb_platform_binary) else() diff --git a/Makefile b/Makefile index 6de499088099..e1c4d723eb22 100644 --- a/Makefile +++ b/Makefile @@ -94,7 +94,7 @@ endif ifneq (${ENABLE_EXTENSION_AUTOINSTALL}, "") CMAKE_VARS:=${CMAKE_VARS} -DENABLE_EXTENSION_AUTOINSTALL=${ENABLE_EXTENSION_AUTOINSTALL} endif -ifneq (${UNSAFE_NUMERIC_CAST}, "") +ifneq (${UNSAFE_NUMERIC_CAST}, ) CMAKE_VARS:=${CMAKE_VARS} -DUNSAFE_NUMERIC_CAST=1 endif ifeq (${BUILD_EXTENSIONS_ONLY}, 1) diff --git a/data/csv/fuzzing/0.csv b/data/csv/fuzzing/0.csv new file mode 100644 index 000000000000..2b454a4f2aaa --- /dev/null +++ b/data/csv/fuzzing/0.csv @@ -0,0 +1,3 @@ +0ð† testEST13 text1 +345 TEST13 'te xt2' +'#67' TESt13 tex \ No newline at end of file diff --git a/data/csv/fuzzing/1.csv b/data/csv/fuzzing/1.csv new file mode 100644 index 000000000000..546fbbee759d --- /dev/null +++ b/data/csv/fuzzing/1.csv @@ -0,0 +1,3 @@ + bl1|2 +1690|00i\047v|2 +4a; bla; b lbla; bla; b la; 1a# 1;2;3;4;5 \ No newline at end of file diff --git a/data/csv/fuzzing/10.csv b/data/csv/fuzzing/10.csv new file mode 100644 index 000000000000..5b15e676dc6e Binary files /dev/null and b/data/csv/fuzzing/10.csv differ diff --git a/data/csv/fuzzing/11.csv b/data/csv/fuzzing/11.csv new file mode 100644 index 000000000000..ef245b4d066f --- /dev/null +++ b/data/csv/fuzzing/11.csv @@ -0,0 +1,7 @@ +a +10"10-2020 +10-10-2020 +10-1-# A few comments +# I'm a csv file +a;-10-2020 +10-1-# A fedd \ No newline at end of file diff --git a/data/csv/fuzzing/12.csv b/data/csv/fuzzing/12.csv new file mode 100644 index 000000000000..946edcf72de9 Binary files /dev/null and b/data/csv/fuzzing/12.csv differ diff --git a/data/csv/fuzzing/13.csv b/data/csv/fuzzing/13.csv new file mode 100644 index 000000000000..5b1cf5bf4c18 Binary files /dev/null and b/data/csv/fuzzing/13.csv differ diff --git a/data/csv/fuzzing/14.csv b/data/csv/fuzzing/14.csv new file mode 100644 index 000000000000..cb8878f5f8a9 Binary files /dev/null and b/data/csv/fuzzing/14.csv differ diff --git a/data/csv/fuzzing/15.csv b/data/csv/fuzzing/15.csv new file mode 100644 index 000000000000..b1181ae3ac51 --- /dev/null +++ b/data/csv/fuzzing/15.csv @@ -0,0 +1,7 @@ +a +10-10-2020 +; ; 3b +1;2;3;4;51/-10-2020 +10-10-# A ÿew commenad +# I'm a csv fildó`;b +# This is also a btsd \ No newline at end of file diff --git a/data/csv/fuzzing/16.csv b/data/csv/fuzzing/16.csv new file mode 100644 index 000000000000..16810496fbf7 --- /dev/null +++
b/data/csv/fuzzing/16.csv @@ -0,0 +1,7 @@ +sv file +a;b +# This o a ba +1O-10-2020|||||u|||||||||||-# A new c@ÿÿnt} +# I'm a csv file +a;b +# This is mlso a badd \ No newline at end of file diff --git a/data/csv/fuzzing/17.csv b/data/csv/fuzzing/17.csv new file mode 100644 index 000000000000..bafb6ab13d5f --- /dev/null +++ b/data/csv/fuzzing/17.csv @@ -0,0 +1,8 @@ +a +10-10-2020 +10-10-2020 +10-:02:03;0# this Ns i file +10-# Aew com-ents +# I'm a csv1:02:03;0# this Ns i file +a;b +# This is also a badd \ No newline at end of file diff --git a/data/csv/fuzzing/18.csv b/data/csv/fuzzing/18.csv new file mode 100644 index 000000000000..7952f702856d --- /dev/null +++ b/data/csv/fuzzing/18.csv @@ -0,0 +1,7 @@ +a +10-10-2020 +10-10-2020 +10-10-# A few c‹mments +# I'm a csv file +a;b + aThis is# lso a badd \ No newline at end of file diff --git a/data/csv/fuzzing/19.csv b/data/csv/fuzzing/19.csv new file mode 100644 index 000000000000..8cc109ba548d --- /dev/null +++ b/data/csv/fuzzing/19.csv @@ -0,0 +1,7 @@ +a&10-10-2020 +10-134fd321 +fg50a&10-10-2020 +10-10-2020 +10-10--# A few ||1 +||||||||||||||| a csv file +a;b # ThiW is also a Oadd \ No newline at end of file diff --git a/data/csv/fuzzing/2.csv b/data/csv/fuzzing/2.csv new file mode 100644 index 000000000000..5881bc71f9f6 --- /dev/null +++ b/data/csv/fuzzing/2.csv @@ -0,0 +1,5 @@ +1 bl00000|19 4|658 +10000000|a; bla; b la; ; 3b +1;2;3;4;5 +1;2;3;4;5 +1#2;3;4;5 \ No newline at end of file diff --git a/data/csv/fuzzing/20.csv b/data/csv/fuzzing/20.csv new file mode 100644 index 000000000000..c33da3187070 --- /dev/null +++ b/data/csv/fuzzing/20.csv @@ -0,0 +1,3 @@ +@|1few comme010-10-2020 +10 10-# A 0-150-202nus +# J'm a csv file´a;aí# Th–s is also a badd \ No newline at end of file diff --git a/data/csv/fuzzing/21.csv b/data/csv/fuzzing/21.csv new file mode 100644 index 000000000000..6f76a4570a66 --- /dev/null +++ b/data/csv/fuzzing/21.csv @@ -0,0 +1,7 @@ +a +10 I'm a csv fe +a;b +# +0-10-200YYYYYYYYYY# I'm a csv file +a;b +# This@is also a bad \ No newline at end of file diff --git a/data/csv/fuzzing/22.csv b/data/csv/fuzzing/22.csv new file mode 100644 index 000000000000..19e211933b74 Binary files /dev/null and b/data/csv/fuzzing/22.csv differ diff --git a/data/csv/fuzzing/23.csv b/data/csv/fuzzing/23.csv new file mode 100644 index 000000000000..eb5f4fadca02 Binary files /dev/null and b/data/csv/fuzzing/23.csv differ diff --git a/data/csv/fuzzing/24.csv b/data/csv/fuzzing/24.csv new file mode 100644 index 000000000000..5c5938f38f55 Binary files /dev/null and b/data/csv/fuzzing/24.csv differ diff --git a/data/csv/fuzzing/25.csv b/data/csv/fuzzing/25.csv new file mode 100644 index 000000000000..abdd678c94f8 Binary files /dev/null and b/data/csv/fuzzing/25.csv differ diff --git a/data/csv/fuzzing/26.csv b/data/csv/fuzzing/26.csv new file mode 100644 index 000000000000..a6a8be9a0aad Binary files /dev/null and b/data/csv/fuzzing/26.csv differ diff --git a/data/csv/fuzzing/27.csv b/data/csv/fuzzing/27.csv new file mode 100644 index 000000000000..cdf23eec20da Binary files /dev/null and b/data/csv/fuzzing/27.csv differ diff --git a/data/csv/fuzzing/28.csv b/data/csv/fuzzing/28.csv new file mode 100644 index 000000000000..a283f33d8d5d Binary files /dev/null and b/data/csv/fuzzing/28.csv differ diff --git a/data/csv/fuzzing/29.csv b/data/csv/fuzzing/29.csv new file mode 100644 index 000000000000..f6f1f1f9875c Binary files /dev/null and b/data/csv/fuzzing/29.csv differ diff --git a/data/csv/fuzzing/3.csv b/data/csv/fuzzing/3.csv new file mode 100644 
index 000000000000..d02fcda1eca9 Binary files /dev/null and b/data/csv/fuzzing/3.csv differ diff --git a/data/csv/fuzzing/30.csv b/data/csv/fuzzing/30.csv new file mode 100644 index 000000000000..a4d087377483 Binary files /dev/null and b/data/csv/fuzzing/30.csv differ diff --git a/data/csv/fuzzing/31.csv b/data/csv/fuzzing/31.csv new file mode 100644 index 000000000000..a260b8e44d45 --- /dev/null +++ b/data/csv/fuzzing/31.csv @@ -0,0 +1,10 @@ +a +10-10-2020 +10-10-2020 +e +a;b +0-# A few comments +# I'm a -10-# A few comments +# I'm a csv file +a;b +# This is also a badd \ No newline at end of file diff --git a/data/csv/fuzzing/32.csv b/data/csv/fuzzing/32.csv new file mode 100644 index 000000000000..aa9cb04406d5 --- /dev/null +++ b/data/csv/fuzzing/32.csv @@ -0,0 +1,14 @@ +a +1€-2020 +10-1€-2020 +10-10-2020 +10-10-# A few commea,b +1,2 +1,2 +) + +ÿÿÿÿts +# I'm a csv file +a;b +## A -2020 +10-10-# A few cowmea,s žlso a badd \ No newline at end of file diff --git a/data/csv/fuzzing/33.csv b/data/csv/fuzzing/33.csv new file mode 100644 index 000000000000..7b5f149c0f68 Binary files /dev/null and b/data/csv/fuzzing/33.csv differ diff --git a/data/csv/fuzzing/34.csv b/data/csv/fuzzing/34.csv new file mode 100644 index 000000000000..9097b99f17bc --- /dev/null +++ b/data/csv/fuzzing/34.csv @@ -0,0 +1,4 @@ +2a,b,t,d,ts +123,TEST2,12:12:12,2000.01.01,2000.01.01 12:12:00 +345,TEST2,14:15:30,2002.02.02,2002.02.02 14:15:00 +b # This2is al1017,2004.16:00 \ No newline at end of file diff --git a/data/csv/fuzzing/35.csv b/data/csv/fuzzing/35.csv new file mode 100644 index 000000000000..c1dd0e7a52c3 Binary files /dev/null and b/data/csv/fuzzing/35.csv differ diff --git a/data/csv/fuzzing/36.csv b/data/csv/fuzzing/36.csv new file mode 100644 index 000000000000..701ac73eebd4 --- /dev/null +++ b/data/csv/fuzzing/36.csv @@ -0,0 +1,2 @@ +2.2.2;;1.2.2,twR,t2 bla hwee,1.2.2,twR,t22,twR,t2 0 +10-10-10-#0 bla hwee,fru \ No newline at end of file diff --git a/data/csv/fuzzing/37.csv b/data/csv/fuzzing/37.csv new file mode 100644 index 000000000000..a5d43b6573bd --- /dev/null +++ b/data/csv/fuzzing/37.csv @@ -0,0 +1,2 @@ +2.2.2;:twR,t,1.2.22 bla;3 ll wi13 text1 +345 xt2'#'5teTE bla blG;- hwee,fru \ No newline at end of file diff --git a/data/csv/fuzzing/38.csv b/data/csv/fuzzing/38.csv new file mode 100644 index 000000000000..361bd6a2ba19 Binary files /dev/null and b/data/csv/fuzzing/38.csv differ diff --git a/data/csv/fuzzing/4.csv b/data/csv/fuzzing/4.csv new file mode 100644 index 000000000000..e9bd430b6437 --- /dev/null +++ b/data/csv/fuzzing/4.csv @@ -0,0 +1,3 @@ +3b +1;2;# +1Y3#5: diff --git a/data/csv/fuzzing/5.csv b/data/csv/fuzzing/5.csv new file mode 100644 index 000000000000..04764888e7fb Binary files /dev/null and b/data/csv/fuzzing/5.csv differ diff --git a/data/csv/fuzzing/6.csv b/data/csv/fuzzing/6.csv new file mode 100644 index 000000000000..58415ec452a3 --- /dev/null +++ b/data/csv/fuzzing/6.csv @@ -0,0 +1,8 @@ +a +10------d--------5|"p2 +bla +blb +bla +blaosie boogie,3, 223-01le-a;b +# T0 +10-10-# Rßa badd \ No newline at end of file diff --git a/data/csv/fuzzing/7.csv b/data/csv/fuzzing/7.csv new file mode 100644 index 000000000000..3a8809d0ab54 Binary files /dev/null and b/data/csv/fuzzing/7.csv differ diff --git a/data/csv/fuzzing/8.csv b/data/csv/fuzzing/8.csv new file mode 100644 index 000000000000..52018657802d Binary files /dev/null and b/data/csv/fuzzing/8.csv differ diff --git a/data/csv/fuzzing/9.csv b/data/csv/fuzzing/9.csv new file mode 100644 index 000000000000..6ae9228a512c Binary 
files /dev/null and b/data/csv/fuzzing/9.csv differ diff --git a/data/json/13725/month=07/mytest.json b/data/json/13725/month=07/mytest.json new file mode 100644 index 000000000000..e9d008e5d983 --- /dev/null +++ b/data/json/13725/month=07/mytest.json @@ -0,0 +1 @@ +{"hello": "there"} diff --git a/extension/icu/icu-datefunc.cpp b/extension/icu/icu-datefunc.cpp index aeab8379efbd..b0202f8d9dc2 100644 --- a/extension/icu/icu-datefunc.cpp +++ b/extension/icu/icu-datefunc.cpp @@ -73,6 +73,10 @@ unique_ptr ICUDateFunc::Bind(ClientContext &context, ScalarFunctio void ICUDateFunc::SetTimeZone(icu::Calendar *calendar, const string_t &tz_id) { auto tz = icu_66::TimeZone::createTimeZone(icu::UnicodeString::fromUTF8(icu::StringPiece(tz_id.GetString()))); + if (*tz == icu::TimeZone::getUnknown()) { + delete tz; + throw NotImplementedException("Unknown TimeZone '%s'", tz_id.GetString()); + } calendar->adoptTimeZone(tz); } @@ -83,7 +87,7 @@ timestamp_t ICUDateFunc::GetTimeUnsafe(icu::Calendar *calendar, uint64_t micros) if (U_FAILURE(status)) { throw InternalException("Unable to get ICU calendar time."); } - return timestamp_t(millis * Interval::MICROS_PER_MSEC + micros); + return timestamp_t(millis * Interval::MICROS_PER_MSEC + int64_t(micros)); } bool ICUDateFunc::TryGetTime(icu::Calendar *calendar, uint64_t micros, timestamp_t &result) { @@ -98,7 +102,7 @@ bool ICUDateFunc::TryGetTime(icu::Calendar *calendar, uint64_t micros, timestamp if (!TryMultiplyOperator::Operation(millis, Interval::MICROS_PER_MSEC, millis)) { return false; } - if (!TryAddOperator::Operation(millis, micros, millis)) { + if (!TryAddOperator::Operation(millis, int64_t(micros), millis)) { return false; } diff --git a/extension/icu/icu-timezone.cpp b/extension/icu/icu-timezone.cpp index bd0f9d213417..0a91ef759144 100644 --- a/extension/icu/icu-timezone.cpp +++ b/extension/icu/icu-timezone.cpp @@ -125,7 +125,7 @@ struct ICUFromNaiveTimestamp : public ICUDateFunc { int32_t secs; int32_t frac; Time::Convert(local_time, hr, mn, secs, frac); - int32_t millis = frac / Interval::MICROS_PER_MSEC; + int32_t millis = frac / int32_t(Interval::MICROS_PER_MSEC); uint64_t micros = frac % Interval::MICROS_PER_MSEC; // Use them to set the time in the time zone @@ -199,7 +199,7 @@ struct ICUToNaiveTimestamp : public ICUDateFunc { } // Extract the time zone parts - auto micros = SetTime(calendar, instant); + auto micros = int32_t(SetTime(calendar, instant)); const auto era = ExtractField(calendar, UCAL_ERA); const auto year = ExtractField(calendar, UCAL_YEAR); const auto mm = ExtractField(calendar, UCAL_MONTH) + 1; @@ -216,7 +216,7 @@ struct ICUToNaiveTimestamp : public ICUDateFunc { const auto secs = ExtractField(calendar, UCAL_SECOND); const auto millis = ExtractField(calendar, UCAL_MILLISECOND); - micros += millis * Interval::MICROS_PER_MSEC; + micros += millis * int32_t(Interval::MICROS_PER_MSEC); dtime_t local_time = Time::FromTime(hr, mn, secs, micros); timestamp_t naive; diff --git a/extension/icu/icu_extension.cpp b/extension/icu/icu_extension.cpp index 707bd35a3b18..e5c038805eb0 100644 --- a/extension/icu/icu_extension.cpp +++ b/extension/icu/icu_extension.cpp @@ -42,7 +42,7 @@ struct IcuBindData : public FunctionData { string country; string tag; - IcuBindData(duckdb::unique_ptr collator_p) : collator(std::move(collator_p)) { + explicit IcuBindData(duckdb::unique_ptr collator_p) : collator(std::move(collator_p)) { } IcuBindData(string language_p, string country_p) : language(std::move(language_p)), country(std::move(country_p)) { @@ -59,7 
+59,7 @@ struct IcuBindData : public FunctionData { } } - IcuBindData(string tag_p) : tag(std::move(tag_p)) { + explicit IcuBindData(string tag_p) : tag(std::move(tag_p)) { UErrorCode status = U_ZERO_ERROR; UCollator *ucollator = ucol_open(tag.c_str(), &status); if (U_FAILURE(status)) { @@ -120,7 +120,7 @@ const string IcuBindData::FUNCTION_PREFIX = "icu_collate_"; static int32_t ICUGetSortKey(icu::Collator &collator, string_t input, duckdb::unique_ptr<char[]> &buffer, int32_t &buffer_size) { icu::UnicodeString unicode_string = - icu::UnicodeString::fromUTF8(icu::StringPiece(input.GetData(), input.GetSize())); + icu::UnicodeString::fromUTF8(icu::StringPiece(input.GetData(), int32_t(input.GetSize()))); int32_t string_size = collator.getSortKey(unicode_string, reinterpret_cast<uint8_t *>(buffer.get()), buffer_size); if (string_size > buffer_size) { // have to resize the buffer @@ -204,18 +204,19 @@ static ScalarFunction GetICUCollateFunction(const string &collation, const strin string fname = IcuBindData::EncodeFunctionName(collation); ScalarFunction result(fname, {LogicalType::VARCHAR}, LogicalType::VARCHAR, ICUCollateFunction, ICUCollateBind); //! collation tag is added into the Function extra info - result.extra_info = std::move(tag); + result.extra_info = tag; result.serialize = IcuBindData::Serialize; result.deserialize = IcuBindData::Deserialize; return result; } static void SetICUTimeZone(ClientContext &context, SetScope scope, Value &parameter) { - icu::StringPiece utf8(StringValue::Get(parameter)); + auto str = StringValue::Get(parameter); + icu::StringPiece utf8(str); const auto uid = icu::UnicodeString::fromUTF8(utf8); duckdb::unique_ptr<icu::TimeZone> tz(icu::TimeZone::createTimeZone(uid)); if (*tz == icu::TimeZone::getUnknown()) { - throw NotImplementedException("Unknown TimeZone setting"); + throw NotImplementedException("Unknown TimeZone '%s'", str); } } diff --git a/extension/jemalloc/jemalloc/README.md b/extension/jemalloc/jemalloc/README.md index cb93e0680c75..d06a138975eb 100644 --- a/extension/jemalloc/jemalloc/README.md +++ b/extension/jemalloc/jemalloc/README.md @@ -166,6 +166,20 @@ static bool os_overcommits_proc(void) ``` +Modify this function in `malloc_io.c` so that it only prints in DEBUG mode. +```c++ +void +malloc_write(const char *s) { +#ifdef DEBUG + if (je_malloc_message != NULL) { + je_malloc_message(NULL, s); + } else { + wrtmessage(NULL, s); + } +#endif +} +``` + Almost no symbols are leaked due to `private_namespace.h`.
The `exported_symbols_check.py` script still found a few, so these lines need to be added to `private_namespace.h`: ```c++ diff --git a/extension/jemalloc/jemalloc/src/malloc_io.c b/extension/jemalloc/jemalloc/src/malloc_io.c index 192d82081cc0..d067bc8e40b2 100644 --- a/extension/jemalloc/jemalloc/src/malloc_io.c +++ b/extension/jemalloc/jemalloc/src/malloc_io.c @@ -79,11 +79,13 @@ JEMALLOC_EXPORT void (*je_malloc_message)(void *, const char *s); */ void malloc_write(const char *s) { +#ifdef DEBUG if (je_malloc_message != NULL) { je_malloc_message(NULL, s); } else { wrtmessage(NULL, s); } +#endif } /* diff --git a/extension/jemalloc/jemalloc_extension.cpp b/extension/jemalloc/jemalloc_extension.cpp index 98d84dc889af..10a49cf99a10 100644 --- a/extension/jemalloc/jemalloc_extension.cpp +++ b/extension/jemalloc/jemalloc_extension.cpp @@ -2,7 +2,6 @@ #include "jemalloc_extension.hpp" #include "duckdb/common/allocator.hpp" -#include "duckdb/common/mutex.hpp" #include "jemalloc/jemalloc.h" namespace duckdb { @@ -88,17 +87,12 @@ void JemallocExtension::ThreadIdle() { } void JemallocExtension::FlushAll() { - static mutex lock; - // Flush thread-local cache SetJemallocCTL("thread.tcache.flush"); // Flush all arenas const auto purge_arena = PurgeArenaString(MALLCTL_ARENAS_ALL); - { - lock_guard guard(lock); - SetJemallocCTL(purge_arena.c_str()); - } + SetJemallocCTL(purge_arena.c_str()); // Reset the peak after resetting SetJemallocCTL("thread.peak.reset"); diff --git a/extension/json/json_functions/json_create.cpp b/extension/json/json_functions/json_create.cpp index dcea02b6b81b..3927daa1b87a 100644 --- a/extension/json/json_functions/json_create.cpp +++ b/extension/json/json_functions/json_create.cpp @@ -550,6 +550,7 @@ static void CreateValues(const StructNames &names, yyjson_mut_doc *doc, yyjson_m case LogicalTypeId::TIMESTAMP_NS: case LogicalTypeId::TIMESTAMP_MS: case LogicalTypeId::TIMESTAMP_SEC: + case LogicalTypeId::VARINT: case LogicalTypeId::UUID: { Vector string_vector(LogicalTypeId::VARCHAR, count); VectorOperations::DefaultCast(value_v, string_vector, count); @@ -562,7 +563,17 @@ static void CreateValues(const StructNames &names, yyjson_mut_doc *doc, yyjson_m TemplatedCreateValues(doc, vals, double_vector, count); break; } - default: + case LogicalTypeId::INVALID: + case LogicalTypeId::UNKNOWN: + case LogicalTypeId::ANY: + case LogicalTypeId::USER: + case LogicalTypeId::CHAR: + case LogicalTypeId::STRING_LITERAL: + case LogicalTypeId::INTEGER_LITERAL: + case LogicalTypeId::POINTER: + case LogicalTypeId::VALIDITY: + case LogicalTypeId::TABLE: + case LogicalTypeId::LAMBDA: throw InternalException("Unsupported type arrived at JSON create function"); } } diff --git a/extension/json/json_functions/read_json_objects.cpp b/extension/json/json_functions/read_json_objects.cpp index 46d4e7982f88..7e97b64717b2 100644 --- a/extension/json/json_functions/read_json_objects.cpp +++ b/extension/json/json_functions/read_json_objects.cpp @@ -33,8 +33,9 @@ static void ReadJSONObjectsFunction(ClientContext &context, TableFunctionInput & if (!gstate.names.empty()) { // Create the strings without copying them - auto strings = FlatVector::GetData(output.data[0]); - auto &validity = FlatVector::Validity(output.data[0]); + const auto col_idx = gstate.column_indices[0]; + auto strings = FlatVector::GetData(output.data[col_idx]); + auto &validity = FlatVector::Validity(output.data[col_idx]); for (idx_t i = 0; i < count; i++) { if (objects[i]) { strings[i] = string_t(units[i].pointer, units[i].size); diff --git 
a/scripts/regression_test_runner.py b/scripts/regression_test_runner.py index b799f6917931..97ef94b16710 100644 --- a/scripts/regression_test_runner.py +++ b/scripts/regression_test_runner.py @@ -44,6 +44,7 @@ def geomean(xs): no_regression_fail = False disable_timeout = False max_timeout = 3600 +root_dir = "" for arg in sys.argv: if arg.startswith("--old="): old_runner = arg.replace("--old=", "") @@ -59,6 +60,8 @@ def geomean(xs): no_regression_fail = True elif arg == "--disable-timeout": disable_timeout = True + elif arg.startswith("--root-dir="): + root_dir = arg.replace("--root-dir=", "") if old_runner is None or new_runner is None or benchmark_file is None: print( @@ -79,6 +82,11 @@ def geomean(xs): def run_benchmark(runner, benchmark): benchmark_args = [runner, benchmark] + + if root_dir: + benchmark_args += [f"--root-dir"] + benchmark_args += [root_dir] + if threads is not None: benchmark_args += ["--threads=%d" % (threads,)] if disable_timeout: diff --git a/scripts/sqllogictest/parser/parser.py b/scripts/sqllogictest/parser/parser.py index f91706d1a9f6..96d9679e58ba 100644 --- a/scripts/sqllogictest/parser/parser.py +++ b/scripts/sqllogictest/parser/parser.py @@ -99,6 +99,62 @@ def __init__(self): "": ["tinyint", "smallint", "integer", "bigint", "hugeint"], "": ["tinyint", "smallint", "integer", "bigint", "hugeint"], "": ["utinyint", "usmallint", "uinteger", "ubigint", "uhugeint"], + "": ["utinyint", "usmallint", "uinteger", "ubigint", "uhugeint"], + "": [ + "bool", + "tinyint", + "smallint", + "int", + "bigint", + "hugeint", + "uhugeint", + "utinyint", + "usmallint", + "uint", + "ubigint", + "date", + "time", + "timestamp", + "timestamp_s", + "timestamp_ms", + "timestamp_ns", + "time_tz", + "timestamp_tz", + "float", + "double", + "dec_4_1", + "dec_9_4", + "dec_18_6", + "dec38_10", + "uuid", + "interval", + "varchar", + "blob", + "bit", + "small_enum", + "medium_enum", + "large_enum", + "int_array", + "double_array", + "date_array", + "timestamp_array", + "timestamptz_array", + "varchar_array", + "nested_int_array", + "struct", + "struct_of_arrays", + "array_of_structs", + "map", + "union", + "fixed_int_array", + "fixed_varchar_array", + "fixed_nested_int_array", + "fixed_nested_varchar_array", + "fixed_struct_array", + "struct_of_fixed_array", + "fixed_array_of_int_list", + "list_of_fixed_int_array", + ], } def peek(self): diff --git a/scripts/sqllogictest/result.py b/scripts/sqllogictest/result.py index 984d01f00c9a..72f3d895257d 100644 --- a/scripts/sqllogictest/result.py +++ b/scripts/sqllogictest/result.py @@ -405,6 +405,8 @@ def __init__( # Now re-open the current database read_only = 'access_mode' in self.config and self.config['access_mode'] == 'read_only' + if 'access_mode' not in self.config: + self.config['access_mode'] = 'automatic' self.database = duckdb.connect(path, read_only, self.config) # Load any previously loaded extensions again @@ -781,7 +783,9 @@ def get_connection(self, name: Optional[str] = None) -> duckdb.DuckDBPyConnectio def execute_load(self, load: Load): if self.in_loop(): - self.fail("load cannot be called in a loop") + # FIXME: should add support for this, the CPP tester supports this + self.skiptest("load cannot be called in a loop") + # self.fail("load cannot be called in a loop") readonly = load.readonly @@ -842,23 +846,26 @@ def is_query_result(sql_query, statement) -> bool: if is_query_result(sql_query, statement): original_rel = conn.query(sql_query) - original_types = original_rel.types - # We create new names for the columns, because they 
might be duplicated - aliased_columns = [f'c{i}' for i in range(len(original_types))] - - expressions = [f'"{name}"::VARCHAR' for name, sql_type in zip(aliased_columns, original_types)] - aliased_table = ", ".join(aliased_columns) - expression_list = ", ".join(expressions) - try: - # Select from the result, converting the Values to the right type for comparison - transformed_query = ( - f"select {expression_list} from original_rel unnamed_subquery_blabla({aliased_table})" - ) - stringified_rel = conn.query(transformed_query) - except duckdb.Error as e: - self.fail(f"Could not select from the ValueRelation: {str(e)}") - result = stringified_rel.fetchall() - query_result = QueryResult(result, original_types) + if original_rel is None: + query_result = QueryResult([(0,)], ['BIGINT']) + else: + original_types = original_rel.types + # We create new names for the columns, because they might be duplicated + aliased_columns = [f'c{i}' for i in range(len(original_types))] + + expressions = [f'"{name}"::VARCHAR' for name, sql_type in zip(aliased_columns, original_types)] + aliased_table = ", ".join(aliased_columns) + expression_list = ", ".join(expressions) + try: + # Select from the result, converting the Values to the right type for comparison + transformed_query = ( + f"select {expression_list} from original_rel unnamed_subquery_blabla({aliased_table})" + ) + stringified_rel = conn.query(transformed_query) + except duckdb.Error as e: + self.fail(f"Could not select from the ValueRelation: {str(e)}") + result = stringified_rel.fetchall() + query_result = QueryResult(result, original_types) elif duckdb.ExpectedResultType.CHANGED_ROWS in statement.expected_result_type: conn.execute(sql_query) result = conn.fetchall() diff --git a/scripts/test_block_sizes.py b/scripts/test_block_sizes.py deleted file mode 100644 index 5de51e2a92dd..000000000000 --- a/scripts/test_block_sizes.py +++ /dev/null @@ -1,31 +0,0 @@ -import os - - -def execute_system_command(cmd): - print(cmd) - retcode = os.system(cmd) - print(retcode) - if retcode != 0: - raise Exception - - -# run the fast tests and all storage-related tests -# with a block size of 16KB and a standard vector size -block_size = 16384 -print("TESTING BLOCK_ALLOC_SIZE=%d" % (block_size,)) -print("TESTING STANDARD_VECTOR_SIZE") - -execute_system_command('rm -rf build') -execute_system_command(f'BLOCK_ALLOC_SIZE={block_size} make relassert') -execute_system_command('build/relassert/test/unittest') -execute_system_command('build/relassert/test/unittest "test/sql/storage/*"') - -# run the fast tests and all storage-related tests -# with a block size of 16KB and a vector size of 512 -vector_size = 512 -print("TESTING BLOCK_ALLOC_SIZE=%d" % (block_size,)) -print("TESTING STANDARD_VECTOR_SIZE=%d" % (vector_size,)) - -execute_system_command('rm -rf build') -execute_system_command(f'BLOCK_ALLOC_SIZE={block_size} STANDARD_VECTOR_SIZE={vector_size} make release') -execute_system_command('build/release/test/unittest') diff --git a/scripts/test_vector_sizes.py b/scripts/test_vector_sizes.py deleted file mode 100644 index 77ee50c27daa..000000000000 --- a/scripts/test_vector_sizes.py +++ /dev/null @@ -1,21 +0,0 @@ -import os - -vector_sizes = [2] - -current_dir = os.getcwd() -build_dir = os.path.join(os.getcwd(), 'build', 'release') - - -def execute_system_command(cmd): - print(cmd) - retcode = os.system(cmd) - print(retcode) - if retcode != 0: - raise Exception(f"Failed to run command {cmd} - exit code {retcode}") - - -for vector_size in vector_sizes: - print("TESTING 
STANDARD_VECTOR_SIZE=%d" % (vector_size,)) - execute_system_command('rm -rf build') - execute_system_command(f'STANDARD_VECTOR_SIZE={vector_size} make relassert') - execute_system_command('python3 scripts/run_tests_one_by_one.py build/relassert/test/unittest --no-exit') diff --git a/src/common/arrow/arrow_appender.cpp b/src/common/arrow/arrow_appender.cpp index 0feaaf7177f4..b478fdb34e56 100644 --- a/src/common/arrow/arrow_appender.cpp +++ b/src/common/arrow/arrow_appender.cpp @@ -160,6 +160,16 @@ static void InitializeFunctionPointers(ArrowAppendData &append_data, const Logic InitializeAppenderForType>(append_data); break; case LogicalTypeId::UUID: + if (append_data.options.arrow_lossless_conversion) { + InitializeAppenderForType>(append_data); + } else { + if (append_data.options.arrow_offset_size == ArrowOffsetSize::LARGE) { + InitializeAppenderForType>(append_data); + } else { + InitializeAppenderForType>(append_data); + } + } + break; case LogicalTypeId::HUGEINT: InitializeAppenderForType>(append_data); break; diff --git a/src/common/arrow/arrow_converter.cpp b/src/common/arrow/arrow_converter.cpp index 6e0ea5f31c5b..3524dc89888f 100644 --- a/src/common/arrow/arrow_converter.cpp +++ b/src/common/arrow/arrow_converter.cpp @@ -146,11 +146,23 @@ void SetArrowFormat(DuckDBArrowSchemaHolder &root_holder, ArrowSchema &child, co child.format = "g"; break; case LogicalTypeId::UUID: { - // This is a canonical extension, hence needs the "arrow." prefix - child.format = "w:16"; - auto schema_metadata = ArrowSchemaMetadata::MetadataFromName("arrow.uuid"); - root_holder.metadata_info.emplace_back(schema_metadata.SerializeMetadata()); - child.metadata = root_holder.metadata_info.back().get(); + if (options.arrow_lossless_conversion) { + // This is a canonical extension, hence needs the "arrow." 
prefix + child.format = "w:16"; + auto schema_metadata = ArrowSchemaMetadata::MetadataFromName("arrow.uuid"); + root_holder.metadata_info.emplace_back(schema_metadata.SerializeMetadata()); + child.metadata = root_holder.metadata_info.back().get(); + } else { + if (options.produce_arrow_string_view) { + child.format = "vu"; + } else { + if (options.arrow_offset_size == ArrowOffsetSize::LARGE) { + child.format = "U"; + } else { + child.format = "u"; + } + } + } break; } case LogicalTypeId::VARCHAR: diff --git a/src/common/cgroups.cpp b/src/common/cgroups.cpp index e565e4731040..b9d2b820b596 100644 --- a/src/common/cgroups.cpp +++ b/src/common/cgroups.cpp @@ -22,6 +22,9 @@ optional_idx CGroups::GetMemoryLimit(FileSystem &fs) { } optional_idx CGroups::GetCGroupV2MemoryLimit(FileSystem &fs) { +#ifdef DUCKDB_WASM + return optional_idx(); +#else const char *cgroup_self = "/proc/self/cgroup"; const char *memory_max = "/sys/fs/cgroup/%s/memory.max"; @@ -42,9 +45,13 @@ optional_idx CGroups::GetCGroupV2MemoryLimit(FileSystem &fs) { } return ReadCGroupValue(fs, memory_max_path); +#endif } optional_idx CGroups::GetCGroupV1MemoryLimit(FileSystem &fs) { +#ifdef DUCKDB_WASM + return optional_idx(); +#else const char *cgroup_self = "/proc/self/cgroup"; const char *memory_limit = "/sys/fs/cgroup/memory/%s/memory.limit_in_bytes"; @@ -65,9 +72,13 @@ optional_idx CGroups::GetCGroupV1MemoryLimit(FileSystem &fs) { } return ReadCGroupValue(fs, memory_limit_path); +#endif } string CGroups::ReadCGroupPath(FileSystem &fs, const char *cgroup_file) { +#ifdef DUCKDB_WASM + return ""; +#else auto handle = fs.OpenFile(cgroup_file, FileFlags::FILE_FLAGS_READ); char buffer[1024]; auto bytes_read = fs.Read(*handle, buffer, sizeof(buffer) - 1); @@ -81,9 +92,13 @@ string CGroups::ReadCGroupPath(FileSystem &fs, const char *cgroup_file) { } return ""; +#endif } string CGroups::ReadMemoryCGroupPath(FileSystem &fs, const char *cgroup_file) { +#ifdef DUCKDB_WASM + return ""; +#else auto handle = fs.OpenFile(cgroup_file, FileFlags::FILE_FLAGS_READ); char buffer[1024]; auto bytes_read = fs.Read(*handle, buffer, sizeof(buffer) - 1); @@ -102,9 +117,13 @@ string CGroups::ReadMemoryCGroupPath(FileSystem &fs, const char *cgroup_file) { } return ""; +#endif } optional_idx CGroups::ReadCGroupValue(FileSystem &fs, const char *file_path) { +#ifdef DUCKDB_WASM + return optional_idx(); +#else auto handle = fs.OpenFile(file_path, FileFlags::FILE_FLAGS_READ); char buffer[100]; auto bytes_read = fs.Read(*handle, buffer, 99); @@ -115,9 +134,14 @@ optional_idx CGroups::ReadCGroupValue(FileSystem &fs, const char *file_path) { return optional_idx(value); } return optional_idx(); +#endif } idx_t CGroups::GetCPULimit(FileSystem &fs, idx_t physical_cores) { +#ifdef DUCKDB_WASM + return physical_cores; +#else + static constexpr const char *cpu_max = "/sys/fs/cgroup/cpu.max"; static constexpr const char *cfs_quota = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; static constexpr const char *cfs_period = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; @@ -159,6 +183,7 @@ idx_t CGroups::GetCPULimit(FileSystem &fs, idx_t physical_cores) { } else { return physical_cores; } +#endif } } // namespace duckdb diff --git a/src/common/enums/file_compression_type.cpp b/src/common/enums/file_compression_type.cpp index 5df6add19cb3..44066f32c743 100644 --- a/src/common/enums/file_compression_type.cpp +++ b/src/common/enums/file_compression_type.cpp @@ -19,4 +19,28 @@ FileCompressionType FileCompressionTypeFromString(const string &input) { } } +string CompressionExtensionFromType(const 
FileCompressionType type) { + switch (type) { + case FileCompressionType::GZIP: + return ".gz"; + case FileCompressionType::ZSTD: + return ".zst"; + default: + throw NotImplementedException("Compression Extension of file compression type is not implemented"); + } +} + +bool IsFileCompressed(string path, FileCompressionType type) { + auto extension = CompressionExtensionFromType(type); + std::size_t question_mark_pos = std::string::npos; + if (!StringUtil::StartsWith(path, "\\\\?\\")) { + question_mark_pos = path.find('?'); + } + path = path.substr(0, question_mark_pos); + if (StringUtil::EndsWith(path, extension)) { + return true; + } + return false; +} + } // namespace duckdb diff --git a/src/common/tree_renderer/text_tree_renderer.cpp b/src/common/tree_renderer/text_tree_renderer.cpp index 5b057d65ea47..8e0fa4253c91 100644 --- a/src/common/tree_renderer/text_tree_renderer.cpp +++ b/src/common/tree_renderer/text_tree_renderer.cpp @@ -65,17 +65,20 @@ static bool NodeHasMultipleChildren(RenderTreeNode &node) { } static bool ShouldRenderWhitespace(RenderTree &root, idx_t x, idx_t y) { + idx_t found_children = 0; for (;; x--) { auto node = root.GetNode(x, y); + if (root.HasNode(x, y + 1)) { + found_children++; + } if (node) { if (NodeHasMultipleChildren(*node)) { - return true; + if (found_children < node->child_positions.size()) { + return true; + } } return false; } - if (root.HasNode(x, y + 1)) { - break; - } if (x == 0) { break; } @@ -190,11 +193,12 @@ void TextTreeRenderer::RenderBoxContent(RenderTree &root, std::ostream &ss, idx_ if (root.HasNode(x, y + 1)) { // node right below this one ss << StringUtil::Repeat(config.HORIZONTAL, config.node_render_width / 2); - ss << config.RTCORNER; if (has_child_to_the_right) { + ss << config.TMIDDLE; // but we have another child to the right! 
keep rendering the line ss << StringUtil::Repeat(config.HORIZONTAL, config.node_render_width / 2); } else { + ss << config.RTCORNER; if (has_adjacent_nodes) { // only a child below this one: fill the rest with spaces ss << StringUtil::Repeat(" ", config.node_render_width / 2); diff --git a/src/common/types/row/tuple_data_collection.cpp b/src/common/types/row/tuple_data_collection.cpp index 1df776acb5d7..a5215d0302eb 100644 --- a/src/common/types/row/tuple_data_collection.cpp +++ b/src/common/types/row/tuple_data_collection.cpp @@ -536,14 +536,18 @@ void TupleDataCollection::ScanAtIndex(TupleDataPinState &pin_state, TupleDataChu segment.allocator->InitializeChunkState(segment, pin_state, chunk_state, chunk_index, false); result.Reset(); + ResetCachedCastVectors(chunk_state, column_ids); + Gather(chunk_state.row_locations, *FlatVector::IncrementalSelectionVector(), chunk.count, column_ids, result, + *FlatVector::IncrementalSelectionVector(), chunk_state.cached_cast_vectors); + result.SetCardinality(chunk.count); +} + +void TupleDataCollection::ResetCachedCastVectors(TupleDataChunkState &chunk_state, const vector &column_ids) { for (idx_t i = 0; i < column_ids.size(); i++) { if (chunk_state.cached_cast_vectors[i]) { chunk_state.cached_cast_vectors[i]->ResetFromCache(*chunk_state.cached_cast_vector_cache[i]); } } - Gather(chunk_state.row_locations, *FlatVector::IncrementalSelectionVector(), chunk.count, column_ids, result, - *FlatVector::IncrementalSelectionVector(), chunk_state.cached_cast_vectors); - result.SetCardinality(chunk.count); } // LCOV_EXCL_START diff --git a/src/common/types/row/tuple_data_scatter_gather.cpp b/src/common/types/row/tuple_data_scatter_gather.cpp index e707941d0ea6..fb9e8a6d1d0a 100644 --- a/src/common/types/row/tuple_data_scatter_gather.cpp +++ b/src/common/types/row/tuple_data_scatter_gather.cpp @@ -1100,6 +1100,7 @@ void TupleDataCollection::Gather(Vector &row_locations, const SelectionVector &s void TupleDataCollection::Gather(Vector &row_locations, const SelectionVector &scan_sel, const idx_t scan_count, const column_t column_id, Vector &result, const SelectionVector &target_sel, optional_ptr cached_cast_vector) const { + D_ASSERT(!cached_cast_vector || FlatVector::Validity(*cached_cast_vector).AllValid()); // ResetCachedCastVectors const auto &gather_function = gather_functions[column_id]; gather_function.function(layout, row_locations, column_id, scan_sel, scan_count, result, target_sel, cached_cast_vector, gather_function.child_functions); diff --git a/src/common/types/varint.cpp b/src/common/types/varint.cpp index 2a5fe3b8ffd8..121b9a3cd861 100644 --- a/src/common/types/varint.cpp +++ b/src/common/types/varint.cpp @@ -120,7 +120,7 @@ bool Varint::VarcharFormatting(const string_t &value, idx_t &start_pos, idx_t &e is_zero = true; return true; } - // This is either a '+' or '-'. Hence invalid. + // This is either a '+' or '-'. Hence, invalid. 
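// (A lone sign character with no digits after it is malformed, hence the
// rejection just below. Aside, inferred from VarintToDouble further on
// rather than stated here: a VARINT blob is a three-byte header followed by
// big-endian magnitude bytes, and negative values are stored with all bits
// inverted, which is why the decoder reads a cleared top bit in blob_ptr[0]
// as "negative" and complements each data byte while accumulating.)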
return false; } idx_t cur_pos = start_pos; @@ -262,9 +262,8 @@ string Varint::VarcharToVarInt(const string_t &value) { return result; } -bool Varint::VarintToDouble(string_t &blob, double &result, bool &strict) { +bool Varint::VarintToDouble(const string_t &blob, double &result, bool &strict) { result = 0; - bool is_negative; if (blob.GetSize() < 4) { throw InvalidInputException("Invalid blob size."); @@ -272,7 +271,7 @@ bool Varint::VarintToDouble(string_t &blob, double &result, bool &strict) { auto blob_ptr = blob.GetData(); // Determine if the number is negative - is_negative = (blob_ptr[0] & 0x80) == 0; + bool is_negative = (blob_ptr[0] & 0x80) == 0; idx_t byte_pos = 0; for (idx_t i = blob.GetSize() - 1; i > 2; i--) { if (is_negative) { @@ -286,7 +285,11 @@ bool Varint::VarintToDouble(string_t &blob, double &result, bool &strict) { if (is_negative) { result *= -1; } - return std::isfinite(result); + if (!std::isfinite(result)) { + // We throw an error + throw ConversionException("Could not convert varint '%s' to Double", VarIntToVarchar(blob)); + } + return true; } } // namespace duckdb diff --git a/src/common/virtual_file_system.cpp b/src/common/virtual_file_system.cpp index 3bc099a2bd01..74892a4e0590 100644 --- a/src/common/virtual_file_system.cpp +++ b/src/common/virtual_file_system.cpp @@ -13,15 +13,15 @@ unique_ptr VirtualFileSystem::OpenFile(const string &path, FileOpenF optional_ptr opener) { auto compression = flags.Compression(); if (compression == FileCompressionType::AUTO_DETECT) { - // auto detect compression settings based on file name + // auto-detect compression settings based on file name auto lower_path = StringUtil::Lower(path); if (StringUtil::EndsWith(lower_path, ".tmp")) { // strip .tmp lower_path = lower_path.substr(0, lower_path.length() - 4); } - if (StringUtil::EndsWith(lower_path, ".gz")) { + if (IsFileCompressed(path, FileCompressionType::GZIP)) { compression = FileCompressionType::GZIP; - } else if (StringUtil::EndsWith(lower_path, ".zst")) { + } else if (IsFileCompressed(path, FileCompressionType::ZSTD)) { compression = FileCompressionType::ZSTD; } else { compression = FileCompressionType::UNCOMPRESSED; diff --git a/src/core_functions/function_list.cpp b/src/core_functions/function_list.cpp index 3a881845d128..c01d3e853e9b 100644 --- a/src/core_functions/function_list.cpp +++ b/src/core_functions/function_list.cpp @@ -52,10 +52,10 @@ static const StaticFunctionDefinition internal_functions[] = { DUCKDB_SCALAR_FUNCTION_SET(BitwiseAndFun), DUCKDB_SCALAR_FUNCTION_ALIAS(ListHasAnyFunAlias), DUCKDB_SCALAR_FUNCTION(PowOperatorFun), - DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListInnerProductFunAlias), + DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListNegativeInnerProductFunAlias), DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListDistanceFunAlias), DUCKDB_SCALAR_FUNCTION_SET(LeftShiftFun), - DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListCosineSimilarityFunAlias), + DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListCosineDistanceFunAlias), DUCKDB_SCALAR_FUNCTION_ALIAS(ListHasAllFunAlias2), DUCKDB_SCALAR_FUNCTION_SET(RightShiftFun), DUCKDB_SCALAR_FUNCTION_SET(AbsOperatorFun), @@ -82,6 +82,7 @@ static const StaticFunctionDefinition internal_functions[] = { DUCKDB_SCALAR_FUNCTION_ALIAS(ArrayAggrFun), DUCKDB_SCALAR_FUNCTION_ALIAS(ArrayAggregateFun), DUCKDB_SCALAR_FUNCTION_ALIAS(ArrayApplyFun), + DUCKDB_SCALAR_FUNCTION_SET(ArrayCosineDistanceFun), DUCKDB_SCALAR_FUNCTION_SET(ArrayCosineSimilarityFun), DUCKDB_SCALAR_FUNCTION_SET(ArrayCrossProductFun), DUCKDB_SCALAR_FUNCTION_SET(ArrayDistanceFun), @@ -92,6 +93,8 @@ static 
const StaticFunctionDefinition internal_functions[] = { DUCKDB_SCALAR_FUNCTION_ALIAS(ArrayHasAllFun), DUCKDB_SCALAR_FUNCTION_ALIAS(ArrayHasAnyFun), DUCKDB_SCALAR_FUNCTION_SET(ArrayInnerProductFun), + DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ArrayNegativeDotProductFun), + DUCKDB_SCALAR_FUNCTION_SET(ArrayNegativeInnerProductFun), DUCKDB_SCALAR_FUNCTION_ALIAS(ArrayReduceFun), DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ArrayReverseSortFun), DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ArraySliceFun), @@ -230,6 +233,7 @@ static const StaticFunctionDefinition internal_functions[] = { DUCKDB_SCALAR_FUNCTION_ALIAS(ListAggrFun), DUCKDB_SCALAR_FUNCTION(ListAggregateFun), DUCKDB_SCALAR_FUNCTION_ALIAS(ListApplyFun), + DUCKDB_SCALAR_FUNCTION_SET(ListCosineDistanceFun), DUCKDB_SCALAR_FUNCTION_SET(ListCosineSimilarityFun), DUCKDB_SCALAR_FUNCTION_SET(ListDistanceFun), DUCKDB_SCALAR_FUNCTION(ListDistinctFun), @@ -239,6 +243,8 @@ static const StaticFunctionDefinition internal_functions[] = { DUCKDB_SCALAR_FUNCTION(ListHasAllFun), DUCKDB_SCALAR_FUNCTION(ListHasAnyFun), DUCKDB_SCALAR_FUNCTION_SET(ListInnerProductFun), + DUCKDB_SCALAR_FUNCTION_SET_ALIAS(ListNegativeDotProductFun), + DUCKDB_SCALAR_FUNCTION_SET(ListNegativeInnerProductFun), DUCKDB_SCALAR_FUNCTION_ALIAS(ListPackFun), DUCKDB_SCALAR_FUNCTION(ListReduceFun), DUCKDB_SCALAR_FUNCTION_SET(ListReverseSortFun), diff --git a/src/core_functions/scalar/array/array_functions.cpp b/src/core_functions/scalar/array/array_functions.cpp index 324de46a4b0a..347ffcbd1031 100644 --- a/src/core_functions/scalar/array/array_functions.cpp +++ b/src/core_functions/scalar/array/array_functions.cpp @@ -1,182 +1,135 @@ #include "duckdb/core_functions/scalar/array_functions.hpp" -#include +#include "duckdb/core_functions/array_kernels.hpp" +#include "duckdb/planner/expression/bound_function_expression.hpp" namespace duckdb { -//------------------------------------------------------------------------------ -// Functors -//------------------------------------------------------------------------------ +static unique_ptr ArrayGenericBinaryBind(ClientContext &context, ScalarFunction &bound_function, + vector> &arguments) { -struct InnerProductOp { - static constexpr const char *NAME = "array_inner_product"; + const auto lhs_is_param = arguments[0]->HasParameter(); + const auto rhs_is_param = arguments[1]->HasParameter(); - template - inline static TYPE *GetResultData(Vector &result_vec) { - return FlatVector::GetData(result_vec); + if (lhs_is_param && rhs_is_param) { + throw ParameterNotResolvedException(); } - template - inline static void Operation(TYPE *l_data, idx_t l_idx, TYPE *r_data, idx_t r_idx, TYPE *result_data, - idx_t result_idx, idx_t size) { + const auto &lhs_type = arguments[0]->return_type; + const auto &rhs_type = arguments[1]->return_type; - TYPE inner_product = 0; + bound_function.arguments[0] = lhs_is_param ? rhs_type : lhs_type; + bound_function.arguments[1] = rhs_is_param ? 
lhs_type : rhs_type; - auto l_ptr = l_data + (l_idx * size); - auto r_ptr = r_data + (r_idx * size); - - for (idx_t elem_idx = 0; elem_idx < size; elem_idx++) { - auto x = *l_ptr++; - auto y = *r_ptr++; - inner_product += x * y; - } - - result_data[result_idx] = inner_product; + if (bound_function.arguments[0].id() != LogicalTypeId::ARRAY || + bound_function.arguments[1].id() != LogicalTypeId::ARRAY) { + throw InvalidInputException( + StringUtil::Format("%s: Arguments must be arrays of FLOAT or DOUBLE", bound_function.name)); } -}; -struct DistanceOp { - static constexpr const char *NAME = "array_distance"; + const auto lhs_size = ArrayType::GetSize(bound_function.arguments[0]); + const auto rhs_size = ArrayType::GetSize(bound_function.arguments[1]); - template - inline static TYPE *GetResultData(Vector &result_vec) { - return FlatVector::GetData(result_vec); + if (lhs_size != rhs_size) { + throw BinderException("%s: Array arguments must be of the same size", bound_function.name); } - template - inline static void Operation(TYPE *l_data, idx_t l_idx, TYPE *r_data, idx_t r_idx, TYPE *result_data, - idx_t result_idx, idx_t size) { - - TYPE distance = 0; - - auto l_ptr = l_data + (l_idx * size); - auto r_ptr = r_data + (r_idx * size); - - for (idx_t elem_idx = 0; elem_idx < size; elem_idx++) { - auto x = *l_ptr++; - auto y = *r_ptr++; - auto diff = x - y; - distance += diff * diff; - } + const auto &lhs_element_type = ArrayType::GetChildType(bound_function.arguments[0]); + const auto &rhs_element_type = ArrayType::GetChildType(bound_function.arguments[1]); - result_data[result_idx] = std::sqrt(distance); + // Resolve common type + LogicalType common_type; + if (!LogicalType::TryGetMaxLogicalType(context, lhs_element_type, rhs_element_type, common_type)) { + throw BinderException("%s: Cannot infer common element type (left = '%s', right = '%s')", bound_function.name, + lhs_element_type.ToString(), rhs_element_type.ToString()); } -}; - -struct CosineSimilarityOp { - static constexpr const char *NAME = "array_cosine_similarity"; - template - inline static TYPE *GetResultData(Vector &result_vec) { - return FlatVector::GetData(result_vec); + // Ensure it is float or double + if (common_type.id() != LogicalTypeId::FLOAT && common_type.id() != LogicalTypeId::DOUBLE) { + throw BinderException("%s: Arguments must be arrays of FLOAT or DOUBLE", bound_function.name); } - template - inline static void Operation(TYPE *l_data, idx_t l_idx, TYPE *r_data, idx_t r_idx, TYPE *result_data, - idx_t result_idx, idx_t size) { - - TYPE distance = 0; - TYPE norm_l = 0; - TYPE norm_r = 0; - - auto l_ptr = l_data + (l_idx * size); - auto r_ptr = r_data + (r_idx * size); - - for (idx_t i = 0; i < size; i++) { - auto x = *l_ptr++; - auto y = *r_ptr++; - distance += x * y; - norm_l += x * x; - norm_r += y * y; - } + // The important part is just that we resolve the size of the input arrays + bound_function.arguments[0] = LogicalType::ARRAY(common_type, lhs_size); + bound_function.arguments[1] = LogicalType::ARRAY(common_type, rhs_size); - auto similarity = distance / (std::sqrt(norm_l) * std::sqrt(norm_r)); + return nullptr; +} - // clamp to [-1, 1] to avoid floating point errors - result_data[result_idx] = std::max(static_cast(-1), std::min(similarity, static_cast(1))); - } -}; +//------------------------------------------------------------------------------ +// Element-wise combine functions +//------------------------------------------------------------------------------ +// Given two arrays of the same size, combine 
their elements into a single array +// of the same size as the input arrays. struct CrossProductOp { - static constexpr const char *NAME = "array_cross_product"; - template - inline static TYPE *GetResultData(Vector &result_vec) { - // Since we return an array here, we need to get the data pointer of the child - auto &child = ArrayVector::GetEntry(result_vec); - return FlatVector::GetData(child); - } - - template - inline static void Operation(TYPE *l_data, idx_t l_idx, TYPE *r_data, idx_t r_idx, TYPE *result_data, - idx_t result_idx, idx_t size) { + static void Operation(const TYPE *lhs_data, const TYPE *rhs_data, TYPE *res_data, idx_t size) { D_ASSERT(size == 3); - auto l_child_idx = l_idx * size; - auto r_child_idx = r_idx * size; - auto res_child_idx = result_idx * size; - - auto lx = l_data[l_child_idx + 0]; - auto ly = l_data[l_child_idx + 1]; - auto lz = l_data[l_child_idx + 2]; + auto lx = lhs_data[0]; + auto ly = lhs_data[1]; + auto lz = lhs_data[2]; - auto rx = r_data[r_child_idx + 0]; - auto ry = r_data[r_child_idx + 1]; - auto rz = r_data[r_child_idx + 2]; + auto rx = rhs_data[0]; + auto ry = rhs_data[1]; + auto rz = rhs_data[2]; - result_data[res_child_idx + 0] = ly * rz - lz * ry; - result_data[res_child_idx + 1] = lz * rx - lx * rz; - result_data[res_child_idx + 2] = lx * ry - ly * rx; + res_data[0] = ly * rz - lz * ry; + res_data[1] = lz * rx - lx * rz; + res_data[2] = lx * ry - ly * rx; } }; -//------------------------------------------------------------------------------ -// Generic Execute and Bind -//------------------------------------------------------------------------------ -// This is a generic executor function for fast binary math operations on -// real-valued arrays. Array elements are assumed to be either FLOAT or DOUBLE, -// and cannot be null. (although the array itself can be null). -// In the future we could extend this further to be truly generic and handle -// other types, unary/ternary operations and/or nulls. 
- -template -static inline void ArrayGenericBinaryExecute(Vector &left, Vector &right, Vector &result, idx_t size, idx_t count) { +template +static void ArrayFixedCombine(DataChunk &args, ExpressionState &state, Vector &result) { + const auto &lstate = state.Cast(); + const auto &expr = lstate.expr.Cast(); + const auto &func_name = expr.function.name; - auto &left_child = ArrayVector::GetEntry(left); - auto &right_child = ArrayVector::GetEntry(right); + const auto count = args.size(); + auto &lhs_child = ArrayVector::GetEntry(args.data[0]); + auto &rhs_child = ArrayVector::GetEntry(args.data[1]); + auto &res_child = ArrayVector::GetEntry(result); - auto &left_child_validity = FlatVector::Validity(left_child); - auto &right_child_validity = FlatVector::Validity(right_child); + const auto &lhs_child_validity = FlatVector::Validity(lhs_child); + const auto &rhs_child_validity = FlatVector::Validity(rhs_child); - UnifiedVectorFormat left_format; - UnifiedVectorFormat right_format; + UnifiedVectorFormat lhs_format; + UnifiedVectorFormat rhs_format; - left.ToUnifiedFormat(count, left_format); - right.ToUnifiedFormat(count, right_format); + args.data[0].ToUnifiedFormat(count, lhs_format); + args.data[1].ToUnifiedFormat(count, rhs_format); - auto left_data = FlatVector::GetData(left_child); - auto right_data = FlatVector::GetData(right_child); - auto result_data = OP::template GetResultData(result); + auto lhs_data = FlatVector::GetData(lhs_child); + auto rhs_data = FlatVector::GetData(rhs_child); + auto res_data = FlatVector::GetData(res_child); for (idx_t i = 0; i < count; i++) { - auto left_idx = left_format.sel->get_index(i); - auto right_idx = right_format.sel->get_index(i); + const auto lhs_idx = lhs_format.sel->get_index(i); + const auto rhs_idx = rhs_format.sel->get_index(i); - if (!left_format.validity.RowIsValid(left_idx) || !right_format.validity.RowIsValid(right_idx)) { + if (!lhs_format.validity.RowIsValid(lhs_idx) || !rhs_format.validity.RowIsValid(rhs_idx)) { FlatVector::SetNull(result, i, true); continue; } - auto left_offset = left_idx * size; - if (!left_child_validity.CheckAllValid(left_offset + size, left_offset)) { - throw InvalidInputException(StringUtil::Format("%s: left argument can not contain NULL values", OP::NAME)); + const auto left_offset = lhs_idx * N; + if (!lhs_child_validity.CheckAllValid(left_offset + N, left_offset)) { + throw InvalidInputException(StringUtil::Format("%s: left argument can not contain NULL values", func_name)); } - auto right_offset = right_idx * size; - if (!right_child_validity.CheckAllValid(right_offset + size, right_offset)) { - throw InvalidInputException(StringUtil::Format("%s: right argument can not contain NULL values", OP::NAME)); + const auto right_offset = rhs_idx * N; + if (!rhs_child_validity.CheckAllValid(right_offset + N, right_offset)) { + throw InvalidInputException( + StringUtil::Format("%s: right argument can not contain NULL values", func_name)); } + const auto result_offset = i * N; - OP::template Operation(left_data, left_idx, right_data, right_idx, result_data, i, size); + const auto lhs_data_ptr = lhs_data + left_offset; + const auto rhs_data_ptr = rhs_data + right_offset; + const auto res_data_ptr = res_data + result_offset; + + OP::Operation(lhs_data_ptr, rhs_data_ptr, res_data_ptr, N); } if (count == 1) { @@ -184,101 +137,123 @@ static inline void ArrayGenericBinaryExecute(Vector &left, Vector &right, Vector } } -template -static void ArrayGenericBinaryFunction(DataChunk &args, ExpressionState &, Vector &result) { - 
auto size = ArrayType::GetSize(args.data[0].GetType()); - auto child_type = ArrayType::GetChildType(args.data[0].GetType()); - switch (child_type.id()) { - case LogicalTypeId::DOUBLE: - ArrayGenericBinaryExecute(args.data[0], args.data[1], result, size, args.size()); - break; - case LogicalTypeId::FLOAT: - ArrayGenericBinaryExecute(args.data[0], args.data[1], result, size, args.size()); - break; - default: - throw NotImplementedException(StringUtil::Format("%s: Unsupported element type", OP::NAME)); - } -} +//------------------------------------------------------------------------------ +// Generic "fold" function +//------------------------------------------------------------------------------ +// Given two arrays, combine and reduce their elements into a single scalar value. -template -static unique_ptr ArrayGenericBinaryBind(ClientContext &context, ScalarFunction &bound_function, - vector> &arguments) { +template +static void ArrayGenericFold(DataChunk &args, ExpressionState &state, Vector &result) { + const auto &lstate = state.Cast(); + const auto &expr = lstate.expr.Cast(); + const auto &func_name = expr.function.name; - // construct return type - auto &left_type = arguments[0]->return_type; - auto &right_type = arguments[1]->return_type; + const auto count = args.size(); + auto &lhs_child = ArrayVector::GetEntry(args.data[0]); + auto &rhs_child = ArrayVector::GetEntry(args.data[1]); - // mystery to me how anything non-array could ever end up here but it happened - if (left_type.id() != LogicalTypeId::ARRAY || right_type.id() != LogicalTypeId::ARRAY) { - throw InvalidInputException(StringUtil::Format("%s: Arguments must be arrays of FLOAT or DOUBLE", OP::NAME)); - } + const auto &lhs_child_validity = FlatVector::Validity(lhs_child); + const auto &rhs_child_validity = FlatVector::Validity(rhs_child); - auto left_size = ArrayType::GetSize(left_type); - auto right_size = ArrayType::GetSize(right_type); - if (left_size != right_size) { - throw InvalidInputException(StringUtil::Format("%s: Array arguments must be of the same size", OP::NAME)); - } - auto size = left_size; + UnifiedVectorFormat lhs_format; + UnifiedVectorFormat rhs_format; - auto child_type = - LogicalType::MaxLogicalType(context, ArrayType::GetChildType(left_type), ArrayType::GetChildType(right_type)); - if (child_type != LogicalTypeId::FLOAT && child_type != LogicalTypeId::DOUBLE) { - throw InvalidInputException( - StringUtil::Format("%s: Array arguments must be of type FLOAT or DOUBLE", OP::NAME)); - } + args.data[0].ToUnifiedFormat(count, lhs_format); + args.data[1].ToUnifiedFormat(count, rhs_format); - // the important part here is that we resolve the array size - auto array_type = LogicalType::ARRAY(child_type, size); + auto lhs_data = FlatVector::GetData(lhs_child); + auto rhs_data = FlatVector::GetData(rhs_child); + auto res_data = FlatVector::GetData(result); - bound_function.arguments[0] = array_type; - bound_function.arguments[1] = array_type; - bound_function.return_type = child_type; + const auto array_size = ArrayType::GetSize(args.data[0].GetType()); + D_ASSERT(array_size == ArrayType::GetSize(args.data[1].GetType())); - return nullptr; -} + for (idx_t i = 0; i < count; i++) { + const auto lhs_idx = lhs_format.sel->get_index(i); + const auto rhs_idx = rhs_format.sel->get_index(i); + + if (!lhs_format.validity.RowIsValid(lhs_idx) || !rhs_format.validity.RowIsValid(rhs_idx)) { + FlatVector::SetNull(result, i, true); + continue; + } + + const auto left_offset = lhs_idx * array_size; + if 
(!lhs_child_validity.CheckAllValid(left_offset + array_size, left_offset)) { + throw InvalidInputException(StringUtil::Format("%s: left argument can not contain NULL values", func_name)); + } + + const auto right_offset = rhs_idx * array_size; + if (!rhs_child_validity.CheckAllValid(right_offset + array_size, right_offset)) { + throw InvalidInputException( + StringUtil::Format("%s: right argument can not contain NULL values", func_name)); + } -template -static inline void ArrayFixedBinaryFunction(DataChunk &args, ExpressionState &, Vector &result) { - ArrayGenericBinaryExecute(args.data[0], args.data[1], result, N, args.size()); + const auto lhs_data_ptr = lhs_data + left_offset; + const auto rhs_data_ptr = rhs_data + right_offset; + + res_data[i] = OP::Operation(lhs_data_ptr, rhs_data_ptr, array_size); + } + + if (count == 1) { + result.SetVectorType(VectorType::CONSTANT_VECTOR); + } } //------------------------------------------------------------------------------ // Function Registration //------------------------------------------------------------------------------ - // Note: In the future we could add a wrapper with a non-type template parameter to specialize for specific array sizes // e.g. 256, 512, 1024, 2048 etc. which may allow the compiler to vectorize the loop better. Perhaps something for an // extension. +template +static void AddArrayFoldFunction(ScalarFunctionSet &set, const LogicalType &type) { + const auto array = LogicalType::ARRAY(type, optional_idx()); + if (type.id() == LogicalTypeId::FLOAT) { + set.AddFunction(ScalarFunction({array, array}, type, ArrayGenericFold, ArrayGenericBinaryBind)); + } else if (type.id() == LogicalTypeId::DOUBLE) { + set.AddFunction(ScalarFunction({array, array}, type, ArrayGenericFold, ArrayGenericBinaryBind)); + } else { + throw NotImplementedException("Array function not implemented for type %s", type.ToString()); + } +} + +ScalarFunctionSet ArrayDistanceFun::GetFunctions() { + ScalarFunctionSet set("array_distance"); + for (auto &type : LogicalType::Real()) { + AddArrayFoldFunction(set, type); + } + return set; +} + ScalarFunctionSet ArrayInnerProductFun::GetFunctions() { ScalarFunctionSet set("array_inner_product"); - // Generic array inner product function for (auto &type : LogicalType::Real()) { - set.AddFunction( - ScalarFunction({LogicalType::ARRAY(type, optional_idx()), LogicalType::ARRAY(type, optional_idx())}, type, - ArrayGenericBinaryFunction, ArrayGenericBinaryBind)); + AddArrayFoldFunction(set, type); } return set; } -ScalarFunctionSet ArrayDistanceFun::GetFunctions() { - ScalarFunctionSet set("array_distance"); - // Generic array distance function +ScalarFunctionSet ArrayNegativeInnerProductFun::GetFunctions() { + ScalarFunctionSet set("array_negative_inner_product"); for (auto &type : LogicalType::Real()) { - set.AddFunction( - ScalarFunction({LogicalType::ARRAY(type, optional_idx()), LogicalType::ARRAY(type, optional_idx())}, type, - ArrayGenericBinaryFunction, ArrayGenericBinaryBind)); + AddArrayFoldFunction(set, type); } return set; } ScalarFunctionSet ArrayCosineSimilarityFun::GetFunctions() { ScalarFunctionSet set("array_cosine_similarity"); - // Generic array cosine similarity function for (auto &type : LogicalType::Real()) { - set.AddFunction( - ScalarFunction({LogicalType::ARRAY(type, optional_idx()), LogicalType::ARRAY(type, optional_idx())}, type, - ArrayGenericBinaryFunction, ArrayGenericBinaryBind)); + AddArrayFoldFunction(set, type); + } + return set; +} + +ScalarFunctionSet 
ArrayCosineDistanceFun::GetFunctions() { + ScalarFunctionSet set("array_cosine_distance"); + for (auto &type : LogicalType::Real()) { + AddArrayFoldFunction(set, type); } return set; } @@ -286,14 +261,12 @@ ScalarFunctionSet ArrayCosineSimilarityFun::GetFunctions() { ScalarFunctionSet ArrayCrossProductFun::GetFunctions() { ScalarFunctionSet set("array_cross_product"); - // Generic array cross product function - auto double_arr = LogicalType::ARRAY(LogicalType::DOUBLE, 3); + auto float_array = LogicalType::ARRAY(LogicalType::FLOAT, 3); + auto double_array = LogicalType::ARRAY(LogicalType::DOUBLE, 3); set.AddFunction( - ScalarFunction({double_arr, double_arr}, double_arr, ArrayFixedBinaryFunction)); - - auto float_arr = LogicalType::ARRAY(LogicalType::FLOAT, 3); + ScalarFunction({float_array, float_array}, float_array, ArrayFixedCombine)); set.AddFunction( - ScalarFunction({float_arr, float_arr}, float_arr, ArrayFixedBinaryFunction)); + ScalarFunction({double_array, double_array}, double_array, ArrayFixedCombine)); return set; } diff --git a/src/core_functions/scalar/array/functions.json b/src/core_functions/scalar/array/functions.json index 4a0198dc8e73..d09627bedffd 100644 --- a/src/core_functions/scalar/array/functions.json +++ b/src/core_functions/scalar/array/functions.json @@ -20,6 +20,13 @@ "example": "array_cosine_similarity([1, 2, 3], [1, 2, 3])", "type": "scalar_function_set" }, + { + "name": "array_cosine_distance", + "parameters": "array1,array2", + "description": "Compute the cosine distance between two arrays of the same size. The array elements cannot be NULL. The arrays can have any size as long as the size is the same for both arguments.", + "example": "array_cosine_distance([1, 2, 3], [1, 2, 3])", + "type": "scalar_function_set" + }, { "name": "array_distance", "parameters": "array1,array2", @@ -34,5 +41,13 @@ "example": "array_inner_product([1, 2, 3], [1, 2, 3])", "type": "scalar_function_set", "aliases": ["array_dot_product"] + }, + { + "name": "array_negative_inner_product", + "parameters": "array1,array2", + "description": "Compute the negative inner product between two arrays of the same size. The array elements cannot be NULL.
The arrays can have any size as long as the size is the same for both arguments.", + "example": "array_negative_inner_product([1, 2, 3], [1, 2, 3])", + "type": "scalar_function_set", + "aliases": ["array_negative_dot_product"] } ] diff --git a/src/core_functions/scalar/list/CMakeLists.txt b/src/core_functions/scalar/list/CMakeLists.txt index c6819a3f7b55..c5dc44fbc379 100644 --- a/src/core_functions/scalar/list/CMakeLists.txt +++ b/src/core_functions/scalar/list/CMakeLists.txt @@ -8,8 +8,6 @@ add_library_unity( list_has_any_or_all.cpp list_sort.cpp list_distance.cpp - list_cosine_similarity.cpp - list_inner_product.cpp list_reduce.cpp list_transform.cpp list_value.cpp diff --git a/src/core_functions/scalar/list/functions.json b/src/core_functions/scalar/list/functions.json index 326e297c6d9e..0f44496120c7 100644 --- a/src/core_functions/scalar/list/functions.json +++ b/src/core_functions/scalar/list/functions.json @@ -110,13 +110,20 @@ "type": "scalar_function_set", "struct": "ListRangeFun" }, + { + "name": "list_cosine_distance", + "parameters": "list1,list2", + "description": "Compute the cosine distance between two lists", + "example": "list_cosine_distance([1, 2, 3], [1, 2, 3])", + "type": "scalar_function_set", + "aliases": ["<=>"] + }, { "name": "list_cosine_similarity", "parameters": "list1,list2", "description": "Compute the cosine similarity between two lists", "example": "list_cosine_similarity([1, 2, 3], [1, 2, 3])", - "type": "scalar_function_set", - "aliases": ["<=>"] + "type": "scalar_function_set" }, { "name": "list_distance", @@ -132,7 +139,15 @@ "description": "Compute the inner product between two lists", "example": "list_inner_product([1, 2, 3], [1, 2, 3])", "type": "scalar_function_set", - "aliases": ["list_dot_product", "<#>"] + "aliases": ["list_dot_product"] + }, + { + "name": "list_negative_inner_product", + "parameters": "list1,list2", + "description": "Compute the negative inner product between two lists", + "example": "list_negative_inner_product([1, 2, 3], [1, 2, 3])", + "type": "scalar_function_set", + "aliases": ["list_negative_dot_product", "<#>"] }, { "name": "unpivot_list", diff --git a/src/core_functions/scalar/list/list_cosine_similarity.cpp b/src/core_functions/scalar/list/list_cosine_similarity.cpp deleted file mode 100644 index 97fcc5c033d6..000000000000 --- a/src/core_functions/scalar/list/list_cosine_similarity.cpp +++ /dev/null @@ -1,85 +0,0 @@ -#include "duckdb/core_functions/scalar/list_functions.hpp" -#include -#include - -namespace duckdb { - -template -static void ListCosineSimilarity(DataChunk &args, ExpressionState &, Vector &result) { - D_ASSERT(args.ColumnCount() == 2); - - auto count = args.size(); - auto &left = args.data[0]; - auto &right = args.data[1]; - auto left_count = ListVector::GetListSize(left); - auto right_count = ListVector::GetListSize(right); - - auto &left_child = ListVector::GetEntry(left); - auto &right_child = ListVector::GetEntry(right); - - left_child.Flatten(left_count); - right_child.Flatten(right_count); - - D_ASSERT(left_child.GetVectorType() == VectorType::FLAT_VECTOR); - D_ASSERT(right_child.GetVectorType() == VectorType::FLAT_VECTOR); - - if (!FlatVector::Validity(left_child).CheckAllValid(left_count)) { - throw InvalidInputException("list_cosine_similarity: left argument can not contain NULL values"); - } - - if (!FlatVector::Validity(right_child).CheckAllValid(right_count)) { - throw InvalidInputException("list_cosine_similarity: right argument can not contain NULL values"); - } - - auto left_data = 
FlatVector::GetData(left_child); - auto right_data = FlatVector::GetData(right_child); - - BinaryExecutor::Execute( - left, right, result, count, [&](list_entry_t left, list_entry_t right) { - if (left.length != right.length) { - throw InvalidInputException(StringUtil::Format( - "list_cosine_similarity: list dimensions must be equal, got left length %d and right length %d", - left.length, right.length)); - } - - auto dimensions = left.length; - - if (dimensions == 0) { - throw InvalidInputException("The cosine similarity for empty vectors is not defined"); - } - - NUMERIC_TYPE distance = 0; - NUMERIC_TYPE norm_l = 0; - NUMERIC_TYPE norm_r = 0; - - auto l_ptr = left_data + left.offset; - auto r_ptr = right_data + right.offset; - for (idx_t i = 0; i < dimensions; i++) { - auto x = *l_ptr++; - auto y = *r_ptr++; - distance += x * y; - norm_l += x * x; - norm_r += y * y; - } - - auto similarity = distance / (std::sqrt(norm_l) * std::sqrt(norm_r)); - - // clamp to [-1, 1] to avoid floating point errors - return std::max(static_cast(-1), std::min(similarity, static_cast(1))); - }); - - if (args.AllConstant()) { - result.SetVectorType(VectorType::CONSTANT_VECTOR); - } -} - -ScalarFunctionSet ListCosineSimilarityFun::GetFunctions() { - ScalarFunctionSet set("list_cosine_similarity"); - set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::FLOAT), LogicalType::LIST(LogicalType::FLOAT)}, - LogicalType::FLOAT, ListCosineSimilarity)); - set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::DOUBLE), LogicalType::LIST(LogicalType::DOUBLE)}, - LogicalType::DOUBLE, ListCosineSimilarity)); - return set; -} - -} // namespace duckdb diff --git a/src/core_functions/scalar/list/list_distance.cpp b/src/core_functions/scalar/list/list_distance.cpp index 476235ce096c..23e19f87101c 100644 --- a/src/core_functions/scalar/list/list_distance.cpp +++ b/src/core_functions/scalar/list/list_distance.cpp @@ -1,61 +1,64 @@ #include "duckdb/core_functions/scalar/list_functions.hpp" -#include +#include "duckdb/core_functions/array_kernels.hpp" +#include "duckdb/planner/expression/bound_function_expression.hpp" namespace duckdb { -template -static void ListDistance(DataChunk &args, ExpressionState &, Vector &result) { - D_ASSERT(args.ColumnCount() == 2); +//------------------------------------------------------------------------------ +// Generic "fold" function +//------------------------------------------------------------------------------ +// Given two lists of the same size, combine and reduce their elements into a +// single scalar value. 
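[Note] To make the fold contract concrete: each kernel is a struct exposing a static Operation(lhs, rhs, count) that reduces two equally-sized element slices to one scalar, plus an ALLOW_EMPTY flag deciding whether a zero-length input yields NULL (via the validity mask) instead of invoking the kernel. A self-contained sketch under those assumptions (the struct name and values here are illustrative, not DuckDB code):

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>

// Illustrative fold kernel following the same shape as the new array kernels:
// reduce two equally-sized slices to a single scalar value.
struct EuclideanDistanceOp {
	// empty input folds to 0 rather than NULL
	static constexpr bool ALLOW_EMPTY = true;

	template <class TYPE>
	static TYPE Operation(const TYPE *lhs, const TYPE *rhs, std::size_t count) {
		TYPE distance = 0;
		for (std::size_t i = 0; i < count; i++) {
			const TYPE diff = lhs[i] - rhs[i];
			distance += diff * diff;
		}
		return std::sqrt(distance);
	}
};

int main() {
	const double a[] {0.0, 3.0};
	const double b[] {4.0, 0.0};
	// 3-4-5 triangle: the distance between (0,3) and (4,0) is exactly 5
	assert(EuclideanDistanceOp::Operation(a, b, 2) == 5.0);
	return 0;
}
```

The executor only has to dispatch on the element type and hand each kernel a pair of validated, NULL-free slices, which is what ListGenericFold below does.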
+ +template +static void ListGenericFold(DataChunk &args, ExpressionState &state, Vector &result) { + const auto &lstate = state.Cast(); + const auto &expr = lstate.expr.Cast(); + const auto &func_name = expr.function.name; auto count = args.size(); - auto &left = args.data[0]; - auto &right = args.data[1]; - auto left_count = ListVector::GetListSize(left); - auto right_count = ListVector::GetListSize(right); - auto &left_child = ListVector::GetEntry(left); - auto &right_child = ListVector::GetEntry(right); + auto &lhs_vec = args.data[0]; + auto &rhs_vec = args.data[1]; + + const auto lhs_count = ListVector::GetListSize(lhs_vec); + const auto rhs_count = ListVector::GetListSize(rhs_vec); - left_child.Flatten(left_count); - right_child.Flatten(right_count); + auto &lhs_child = ListVector::GetEntry(lhs_vec); + auto &rhs_child = ListVector::GetEntry(rhs_vec); - D_ASSERT(left_child.GetVectorType() == VectorType::FLAT_VECTOR); - D_ASSERT(right_child.GetVectorType() == VectorType::FLAT_VECTOR); + lhs_child.Flatten(lhs_count); + rhs_child.Flatten(rhs_count); - if (!FlatVector::Validity(left_child).CheckAllValid(left_count)) { - throw InvalidInputException("list_distance: left argument can not contain NULL values"); + D_ASSERT(lhs_child.GetVectorType() == VectorType::FLAT_VECTOR); + D_ASSERT(rhs_child.GetVectorType() == VectorType::FLAT_VECTOR); + + if (!FlatVector::Validity(lhs_child).CheckAllValid(lhs_count)) { + throw InvalidInputException("%s: left argument can not contain NULL values", func_name); } - if (!FlatVector::Validity(right_child).CheckAllValid(right_count)) { - throw InvalidInputException("list_distance: right argument can not contain NULL values"); + if (!FlatVector::Validity(rhs_child).CheckAllValid(rhs_count)) { + throw InvalidInputException("%s: right argument can not contain NULL values", func_name); } - auto left_data = FlatVector::GetData(left_child); - auto right_data = FlatVector::GetData(right_child); + auto lhs_data = FlatVector::GetData(lhs_child); + auto rhs_data = FlatVector::GetData(rhs_child); - BinaryExecutor::Execute( - left, right, result, count, [&](list_entry_t left, list_entry_t right) { + BinaryExecutor::ExecuteWithNulls( + lhs_vec, rhs_vec, result, count, + [&](const list_entry_t &left, const list_entry_t &right, ValidityMask &mask, idx_t row_idx) { if (left.length != right.length) { - throw InvalidInputException(StringUtil::Format( - "list_distance: list dimensions must be equal, got left length %d and right length %d", left.length, - right.length)); + throw InvalidInputException( + "%s: list dimensions must be equal, got left length '%d' and right length '%d'", func_name, + left.length, right.length); } - auto dimensions = left.length; - - NUMERIC_TYPE distance = 0; - - auto l_ptr = left_data + left.offset; - auto r_ptr = right_data + right.offset; - - for (idx_t i = 0; i < dimensions; i++) { - auto x = *l_ptr++; - auto y = *r_ptr++; - auto diff = x - y; - distance += diff * diff; + if (!OP::ALLOW_EMPTY && left.length == 0) { + mask.SetInvalid(row_idx); + return TYPE(); } - return std::sqrt(distance); + return OP::Operation(lhs_data + left.offset, rhs_data + right.offset, left.length); }); if (args.AllConstant()) { @@ -63,12 +66,59 @@ static void ListDistance(DataChunk &args, ExpressionState &, Vector &result) { } } +//------------------------------------------------------------------------- +// Function Registration +//------------------------------------------------------------------------- + +template +static void AddListFoldFunction(ScalarFunctionSet 
&set, const LogicalType &type) { + const auto list = LogicalType::LIST(type); + if (type.id() == LogicalTypeId::FLOAT) { + set.AddFunction(ScalarFunction({list, list}, type, ListGenericFold)); + } else if (type.id() == LogicalTypeId::DOUBLE) { + set.AddFunction(ScalarFunction({list, list}, type, ListGenericFold)); + } else { + throw NotImplementedException("List function not implemented for type %s", type.ToString()); + } +} + ScalarFunctionSet ListDistanceFun::GetFunctions() { ScalarFunctionSet set("list_distance"); - set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::FLOAT), LogicalType::LIST(LogicalType::FLOAT)}, - LogicalType::FLOAT, ListDistance)); - set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::DOUBLE), LogicalType::LIST(LogicalType::DOUBLE)}, - LogicalType::DOUBLE, ListDistance)); + for (auto &type : LogicalType::Real()) { + AddListFoldFunction(set, type); + } + return set; +} + +ScalarFunctionSet ListInnerProductFun::GetFunctions() { + ScalarFunctionSet set("list_inner_product"); + for (auto &type : LogicalType::Real()) { + AddListFoldFunction(set, type); + } + return set; +} + +ScalarFunctionSet ListNegativeInnerProductFun::GetFunctions() { + ScalarFunctionSet set("list_negative_inner_product"); + for (auto &type : LogicalType::Real()) { + AddListFoldFunction(set, type); + } + return set; +} + +ScalarFunctionSet ListCosineSimilarityFun::GetFunctions() { + ScalarFunctionSet set("list_cosine_similarity"); + for (auto &type : LogicalType::Real()) { + AddListFoldFunction(set, type); + } + return set; +} + +ScalarFunctionSet ListCosineDistanceFun::GetFunctions() { + ScalarFunctionSet set("list_cosine_distance"); + for (auto &type : LogicalType::Real()) { + AddListFoldFunction(set, type); + } return set; } diff --git a/src/core_functions/scalar/list/list_inner_product.cpp b/src/core_functions/scalar/list/list_inner_product.cpp deleted file mode 100644 index 9ec7d38c2284..000000000000 --- a/src/core_functions/scalar/list/list_inner_product.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include "duckdb/core_functions/scalar/list_functions.hpp" - -namespace duckdb { - -template -static void ListInnerProduct(DataChunk &args, ExpressionState &, Vector &result) { - D_ASSERT(args.ColumnCount() == 2); - - auto count = args.size(); - auto &left = args.data[0]; - auto &right = args.data[1]; - auto left_count = ListVector::GetListSize(left); - auto right_count = ListVector::GetListSize(right); - - auto &left_child = ListVector::GetEntry(left); - auto &right_child = ListVector::GetEntry(right); - - left_child.Flatten(left_count); - right_child.Flatten(right_count); - - D_ASSERT(left_child.GetVectorType() == VectorType::FLAT_VECTOR); - D_ASSERT(right_child.GetVectorType() == VectorType::FLAT_VECTOR); - - if (!FlatVector::Validity(left_child).CheckAllValid(left_count)) { - throw InvalidInputException("list_inner_product: left argument can not contain NULL values"); - } - - if (!FlatVector::Validity(right_child).CheckAllValid(right_count)) { - throw InvalidInputException("list_inner_product: right argument can not contain NULL values"); - } - - auto left_data = FlatVector::GetData(left_child); - auto right_data = FlatVector::GetData(right_child); - - BinaryExecutor::Execute( - left, right, result, count, [&](list_entry_t left, list_entry_t right) { - if (left.length != right.length) { - throw InvalidInputException(StringUtil::Format( - "list_inner_product: list dimensions must be equal, got left length %d and right length %d", - left.length, right.length)); - } - - auto dimensions = 
left.length; - - NUMERIC_TYPE distance = 0; - - auto l_ptr = left_data + left.offset; - auto r_ptr = right_data + right.offset; - - for (idx_t i = 0; i < dimensions; i++) { - auto x = *l_ptr++; - auto y = *r_ptr++; - distance += x * y; - } - - return distance; - }); - - if (args.AllConstant()) { - result.SetVectorType(VectorType::CONSTANT_VECTOR); - } -} - -ScalarFunctionSet ListInnerProductFun::GetFunctions() { - ScalarFunctionSet set("list_inner_product"); - set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::FLOAT), LogicalType::LIST(LogicalType::FLOAT)}, - LogicalType::FLOAT, ListInnerProduct)); - set.AddFunction(ScalarFunction({LogicalType::LIST(LogicalType::DOUBLE), LogicalType::LIST(LogicalType::DOUBLE)}, - LogicalType::DOUBLE, ListInnerProduct)); - return set; -} - -} // namespace duckdb diff --git a/src/execution/index/art/art.cpp b/src/execution/index/art/art.cpp index a9bba85ebff1..be4beef1988c 100644 --- a/src/execution/index/art/art.cpp +++ b/src/execution/index/art/art.cpp @@ -873,7 +873,7 @@ bool ART::SearchLess(ARTKey &upper_bound, bool equal, idx_t max_count, unsafe_ve it.FindMinimum(tree); // Early-out, if the minimum value is higher than the upper bound. - if (it.current_key.GreaterThan(upper_bound, equal)) { + if (it.current_key.GreaterThan(upper_bound, equal, it.GetNestedDepth())) { return true; } diff --git a/src/execution/index/art/iterator.cpp b/src/execution/index/art/iterator.cpp index ca6f5c713051..3f1f1f4f1134 100644 --- a/src/execution/index/art/iterator.cpp +++ b/src/execution/index/art/iterator.cpp @@ -23,7 +23,7 @@ bool IteratorKey::Contains(const ARTKey &key) const { return true; } -bool IteratorKey::GreaterThan(const ARTKey &key, const bool equal) const { +bool IteratorKey::GreaterThan(const ARTKey &key, const bool equal, const uint8_t nested_depth) const { for (idx_t i = 0; i < MinValue(Size(), key.len); i++) { if (key_bytes[i] > key.data[i]) { return true; @@ -31,12 +31,11 @@ bool IteratorKey::GreaterThan(const ARTKey &key, const bool equal) const { return false; } } - if (equal) { - // Returns true, if current_key is greater than key. - return Size() > key.len; - } - // Returns true, if current_key and key match or current_key is greater than key. - return Size() >= key.len; + + // Returns true, if current_key is greater than (or equal to) key. + D_ASSERT(Size() >= nested_depth); + auto this_len = Size() - nested_depth; + return equal ? this_len > key.len : this_len >= key.len; } //===--------------------------------------------------------------------===// @@ -48,7 +47,7 @@ bool Iterator::Scan(const ARTKey &upper_bound, const idx_t max_count, unsafe_vec do { // An empty upper bound indicates that no upper bound exists. 
if (!upper_bound.Empty() && status == GateStatus::GATE_NOT_SET) { - if (current_key.GreaterThan(upper_bound, equal)) { + if (current_key.GreaterThan(upper_bound, equal, nested_depth)) { return true; } } diff --git a/src/execution/join_hashtable.cpp b/src/execution/join_hashtable.cpp index 9ccc105d0f68..c46745eb18f5 100644 --- a/src/execution/join_hashtable.cpp +++ b/src/execution/join_hashtable.cpp @@ -499,6 +499,7 @@ static inline void PerformKeyComparison(JoinHashTable::InsertState &state, JoinH // The target selection vector says where to write the results into the lhs_data, we just want to write // sequentially as otherwise we trigger a bug in the Gather function + data_collection.ResetCachedCastVectors(state.chunk_state, ht.equality_predicate_columns); data_collection.Gather(row_locations, state.salt_match_sel, count, ht.equality_predicate_columns, state.lhs_data, *FlatVector::IncrementalSelectionVector(), state.chunk_state.cached_cast_vectors); TupleDataCollection::ToUnifiedFormat(state.chunk_state, state.lhs_data); diff --git a/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp b/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp index 6e0d6048b6db..f66b180e786e 100644 --- a/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/column_count_scanner.cpp @@ -36,7 +36,7 @@ bool ColumnCountResult::AddRow(ColumnCountResult &result, idx_t buffer_pos) { } void ColumnCountResult::SetComment(ColumnCountResult &result, idx_t buffer_pos) { - if (result.current_column_count == 0) { + if (!result.states.WasStandard()) { result.cur_line_starts_as_comment = true; } result.comment = true; diff --git a/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp b/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp index de859d5bf147..44d179091c2e 100644 --- a/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +++ b/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp @@ -518,7 +518,7 @@ void CSVSniffer::DetectDialect() { // if no dialect candidate was found, we throw an exception if (candidates.empty()) { - auto error = CSVError::DialectSniffingError(options, dialect_candidates.Print()); + auto error = CSVError::SniffingError(options, dialect_candidates.Print()); error_handler->Error(error); } } diff --git a/src/execution/operator/csv_scanner/sniffer/type_detection.cpp b/src/execution/operator/csv_scanner/sniffer/type_detection.cpp index a61acd6a290b..11d79c40d820 100644 --- a/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +++ b/src/execution/operator/csv_scanner/sniffer/type_detection.cpp @@ -413,6 +413,20 @@ void CSVSniffer::DetectTypes() { SetUserDefinedDateTimeFormat(*candidate->state_machine); // Parse chunk and read csv with info candidate auto &data_chunk = candidate->ParseChunk().ToChunk(); + if (!candidate->error_handler->errors.empty()) { + bool break_loop = false; + for (auto &errors : candidate->error_handler->errors) { + for (auto &error : errors.second) { + if (error.type != CSVErrorType::MAXIMUM_LINE_SIZE) { + break_loop = true; + break; + } + } + } + if (break_loop) { + continue; + } + } idx_t start_idx_detection = 0; idx_t chunk_size = data_chunk.size(); if (chunk_size > 1 && @@ -465,6 +479,11 @@ void CSVSniffer::DetectTypes() { } } } + if (!best_candidate) { + DialectCandidates dialect_candidates(options.dialect_options.state_machine_options); + auto error = CSVError::SniffingError(options, dialect_candidates.Print()); + 
error_handler->Error(error); + } // Assert that it's all good at this point. D_ASSERT(best_candidate && !best_format_candidates.empty()); } diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 14e76ee1fec1..4f3e9dce824e 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -41,6 +41,11 @@ CSVGlobalState::CSVGlobalState(ClientContext &context_p, const shared_ptr(*file_scans.back()->buffer_manager, current_boundary.GetBufferIdx()); } +bool CSVGlobalState::IsDone() const { + lock_guard parallel_lock(main_mutex); + return current_boundary.done; +} + double CSVGlobalState::GetProgress(const ReadCSVData &bind_data_p) const { lock_guard parallel_lock(main_mutex); idx_t total_files = bind_data.files.size(); diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index 30450d5c2913..e7a41f3a63fa 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -226,12 +226,12 @@ CSVError CSVError::HeaderSniffingError(const CSVReaderOptions &options, const ve return CSVError(error.str(), SNIFFING, {}); } -CSVError CSVError::DialectSniffingError(const CSVReaderOptions &options, const string &search_space) { +CSVError CSVError::SniffingError(const CSVReaderOptions &options, const string &search_space) { std::ostringstream error; // 1. Which file error << "Error when sniffing file \"" << options.file_path << "\"." << '\n'; // 2. What's the error - error << "It was not possible to automatically detect the CSV Parsing dialect" << '\n'; + error << "It was not possible to automatically detect the CSV Parsing dialect/types" << '\n'; // 2. What was the search space? 
error << "The search space used was:" << '\n'; diff --git a/src/execution/operator/helper/physical_load.cpp b/src/execution/operator/helper/physical_load.cpp index e20b5af0d738..5f0e7a027fb7 100644 --- a/src/execution/operator/helper/physical_load.cpp +++ b/src/execution/operator/helper/physical_load.cpp @@ -16,15 +16,23 @@ static void InstallFromRepository(ClientContext &context, const LoadInfo &info) repository = ExtensionRepository::GetRepositoryByUrl(info.repository); } - ExtensionHelper::InstallExtension(context, info.filename, info.load_type == LoadType::FORCE_INSTALL, repository, - true, info.version); + ExtensionInstallOptions options; + options.force_install = info.load_type == LoadType::FORCE_INSTALL; + options.throw_on_origin_mismatch = true; + options.version = info.version; + options.repository = repository; + + ExtensionHelper::InstallExtension(context, info.filename, options); } SourceResultType PhysicalLoad::GetData(ExecutionContext &context, DataChunk &chunk, OperatorSourceInput &input) const { if (info->load_type == LoadType::INSTALL || info->load_type == LoadType::FORCE_INSTALL) { if (info->repository.empty()) { - ExtensionHelper::InstallExtension(context.client, info->filename, - info->load_type == LoadType::FORCE_INSTALL, nullptr, true, info->version); + ExtensionInstallOptions options; + options.force_install = info->load_type == LoadType::FORCE_INSTALL; + options.throw_on_origin_mismatch = true; + options.version = info->version; + ExtensionHelper::InstallExtension(context.client, info->filename, options); } else { InstallFromRepository(context.client, *info); } diff --git a/src/function/table/arrow_conversion.cpp b/src/function/table/arrow_conversion.cpp index de59b33eac09..b83bbf568db4 100644 --- a/src/function/table/arrow_conversion.cpp +++ b/src/function/table/arrow_conversion.cpp @@ -1,11 +1,13 @@ -#include "duckdb/function/table/arrow.hpp" +#include "duckdb/common/exception/conversion_exception.hpp" #include "duckdb/common/limits.hpp" #include "duckdb/common/operator/multiply.hpp" -#include "duckdb/common/types/hugeint.hpp" #include "duckdb/common/types/arrow_aux_data.hpp" -#include "duckdb/function/scalar/nested_functions.hpp" -#include "duckdb/common/exception/conversion_exception.hpp" #include "duckdb/common/types/arrow_string_view_type.hpp" +#include "duckdb/common/types/hugeint.hpp" +#include "duckdb/function/scalar/nested_functions.hpp" +#include "duckdb/function/table/arrow.hpp" + +#include "duckdb/common/bswap.hpp" namespace duckdb { @@ -36,7 +38,7 @@ static void ShiftRight(unsigned char *ar, int size, int shift) { } } -idx_t GetEffectiveOffset(ArrowArray &array, int64_t parent_offset, const ArrowScanLocalState &state, +idx_t GetEffectiveOffset(const ArrowArray &array, int64_t parent_offset, const ArrowScanLocalState &state, int64_t nested_offset = -1) { if (nested_offset != -1) { // The parent of this array is a list @@ -108,7 +110,7 @@ static void SetValidityMask(Vector &vector, ArrowArray &array, const ArrowScanLo GetValidityMask(mask, array, scan_state, size, parent_offset, nested_offset, add_null); } -static void ColumnArrowToDuckDBRunEndEncoded(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, +static void ColumnArrowToDuckDBRunEndEncoded(Vector &vector, const ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, const ArrowType &arrow_type, int64_t nested_offset = -1, ValidityMask *parent_mask = nullptr, uint64_t parent_offset = 0); @@ -118,7 +120,7 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray 
&array, ArrowArraySca static void ColumnArrowToDuckDBDictionary(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, const ArrowType &arrow_type, int64_t nested_offset = -1, - ValidityMask *parent_mask = nullptr, uint64_t parent_offset = 0); + const ValidityMask *parent_mask = nullptr, uint64_t parent_offset = 0); namespace { @@ -211,7 +213,7 @@ static ArrowListOffsetData ConvertArrowListOffsets(Vector &vector, ArrowArray &a } static void ArrowToDuckDBList(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, - const ArrowType &arrow_type, int64_t nested_offset, ValidityMask *parent_mask, + const ArrowType &arrow_type, int64_t nested_offset, const ValidityMask *parent_mask, int64_t parent_offset) { auto &scan_state = array_state.state; @@ -270,7 +272,7 @@ } static void ArrowToDuckDBArray(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, - const ArrowType &arrow_type, int64_t nested_offset, ValidityMask *parent_mask, + const ArrowType &arrow_type, int64_t nested_offset, const ValidityMask *parent_mask, int64_t parent_offset) { auto &array_info = arrow_type.GetTypeInfo(); @@ -452,17 +454,35 @@ static void TimeConversion(Vector &vector, ArrowArray &array, const ArrowScanLoc int64_t nested_offset, int64_t parent_offset, idx_t size, int64_t conversion) { auto tgt_ptr = FlatVector::GetData(vector); auto &validity_mask = FlatVector::Validity(vector); - auto src_ptr = (T *)array.buffers[1] + GetEffectiveOffset(array, parent_offset, scan_state, nested_offset); + auto src_ptr = + static_cast(array.buffers[1]) + GetEffectiveOffset(array, parent_offset, scan_state, nested_offset); for (idx_t row = 0; row < size; row++) { if (!validity_mask.RowIsValid(row)) { continue; } - if (!TryMultiplyOperator::Operation((int64_t)src_ptr[row], conversion, tgt_ptr[row].micros)) { + if (!TryMultiplyOperator::Operation(static_cast(src_ptr[row]), conversion, tgt_ptr[row].micros)) { throw ConversionException("Could not convert Time to Microsecond"); } } } +static void UUIDConversion(Vector &vector, const ArrowArray &array, const ArrowScanLocalState &scan_state, + int64_t nested_offset, int64_t parent_offset, idx_t size) { + auto tgt_ptr = FlatVector::GetData(vector); + auto &validity_mask = FlatVector::Validity(vector); + auto src_ptr = static_cast(array.buffers[1]) + + GetEffectiveOffset(array, parent_offset, scan_state, nested_offset); + for (idx_t row = 0; row < size; row++) { + if (!validity_mask.RowIsValid(row)) { + continue; + } + tgt_ptr[row].lower = static_cast(BSwap(src_ptr[row].upper)); + // flip the upper MSB + tgt_ptr[row].upper = + static_cast(static_cast(BSwap(src_ptr[row].lower)) ^ (static_cast(1) << 63)); + } +} + static void TimestampTZConversion(Vector &vector, ArrowArray &array, const ArrowScanLocalState &scan_state, int64_t nested_offset, int64_t parent_offset, idx_t size, int64_t conversion) { auto tgt_ptr = FlatVector::GetData(vector); @@ -569,7 +589,7 @@ static void FlattenRunEnds(Vector &result, ArrowRunEndEncodingState &run_end_enc idx_t index = 0; if (value_format.validity.AllValid()) { // None of the compressed values are NULL - for (; run < compressed_size; run++) { + for (; run < compressed_size; ++run) { auto run_end_index = run_end_format.sel->get_index(run); auto value_index = value_format.sel->get_index(run); auto &value = values_data[value_index]; @@ -587,13 +607,13 @@ static void FlattenRunEnds(Vector &result,
ArrowRunEndEncodingState &run_end_enc if (index >= count) { if (logical_index + index >= run_end) { // The last run was completed, forward the run index - run++; + ++run; } break; } } } else { - for (; run < compressed_size; run++) { + for (; run < compressed_size; ++run) { auto run_end_index = run_end_format.sel->get_index(run); auto value_index = value_format.sel->get_index(run); auto run_end = static_cast(run_ends_data[run_end_index]); @@ -618,7 +638,7 @@ static void FlattenRunEnds(Vector &result, ArrowRunEndEncodingState &run_end_enc if (index >= count) { if (logical_index + index >= run_end) { // The last run was completed, forward the run index - run++; + ++run; } break; } @@ -683,7 +703,7 @@ static void FlattenRunEndsSwitch(Vector &result, ArrowRunEndEncodingState &run_e } } -static void ColumnArrowToDuckDBRunEndEncoded(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, +static void ColumnArrowToDuckDBRunEndEncoded(Vector &vector, const ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, const ArrowType &arrow_type, int64_t nested_offset, ValidityMask *parent_mask, uint64_t parent_offset) { // Scan the 'run_ends' array @@ -778,7 +798,6 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArraySca case LogicalTypeId::UBIGINT: case LogicalTypeId::BIGINT: case LogicalTypeId::HUGEINT: - case LogicalTypeId::UUID: case LogicalTypeId::UHUGEINT: case LogicalTypeId::TIMESTAMP: case LogicalTypeId::TIMESTAMP_SEC: @@ -788,6 +807,9 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArraySca DirectConversion(vector, array, scan_state, nested_offset, parent_offset); break; } + case LogicalTypeId::UUID: + UUIDConversion(vector, array, scan_state, nested_offset, NumericCast(parent_offset), size); + break; case LogicalTypeId::VARCHAR: { auto &string_info = arrow_type.GetTypeInfo(); auto size_type = string_info.GetSizeType(); @@ -830,8 +852,8 @@ static void ColumnArrowToDuckDB(Vector &vector, ArrowArray &array, ArrowArraySca GetEffectiveOffset(array, NumericCast(parent_offset), scan_state, nested_offset); auto tgt_ptr = FlatVector::GetData(vector); for (idx_t row = 0; row < size; row++) { - tgt_ptr[row] = date_t( - UnsafeNumericCast(int64_t(src_ptr[row]) / static_cast(1000 * 60 * 60 * 24))); + tgt_ptr[row] = date_t(UnsafeNumericCast(static_cast(src_ptr[row]) / + static_cast(1000 * 60 * 60 * 24))); } break; } @@ -1153,7 +1175,7 @@ static void SetMaskedSelectionVectorLoop(SelectionVector &sel, data_ptr_t indice } } -static void SetSelectionVector(SelectionVector &sel, data_ptr_t indices_p, LogicalType &logical_type, idx_t size, +static void SetSelectionVector(SelectionVector &sel, data_ptr_t indices_p, const LogicalType &logical_type, idx_t size, ValidityMask *mask = nullptr, idx_t last_element_pos = 0) { sel.Initialize(size); @@ -1242,7 +1264,7 @@ static void SetSelectionVector(SelectionVector &sel, data_ptr_t indices_p, Logic } } -static bool CanContainNull(ArrowArray &array, ValidityMask *parent_mask) { +static bool CanContainNull(const ArrowArray &array, const ValidityMask *parent_mask) { if (array.null_count > 0) { return true; } @@ -1254,7 +1276,7 @@ static bool CanContainNull(ArrowArray &array, ValidityMask *parent_mask) { static void ColumnArrowToDuckDBDictionary(Vector &vector, ArrowArray &array, ArrowArrayScanState &array_state, idx_t size, const ArrowType &arrow_type, int64_t nested_offset, - ValidityMask *parent_mask, uint64_t parent_offset) { + const ValidityMask *parent_mask, uint64_t parent_offset) { 
D_ASSERT(arrow_type.HasDictionary()); auto &scan_state = array_state.state; const bool has_nulls = CanContainNull(array, parent_mask); diff --git a/src/function/table/copy_csv.cpp b/src/function/table/copy_csv.cpp index a94df0b7a6ad..b2c16a671721 100644 --- a/src/function/table/copy_csv.cpp +++ b/src/function/table/copy_csv.cpp @@ -182,13 +182,13 @@ static unique_ptr WriteCSVBind(ClientContext &context, CopyFunctio switch (bind_data->options.compression) { case FileCompressionType::GZIP: - if (!StringUtil::EndsWith(input.file_extension, ".gz")) { - input.file_extension += ".gz"; + if (!IsFileCompressed(input.file_extension, FileCompressionType::GZIP)) { + input.file_extension += CompressionExtensionFromType(FileCompressionType::GZIP); } break; case FileCompressionType::ZSTD: - if (!StringUtil::EndsWith(input.file_extension, ".zst")) { - input.file_extension += ".zst"; + if (!IsFileCompressed(input.file_extension, FileCompressionType::ZSTD)) { + input.file_extension += CompressionExtensionFromType(FileCompressionType::ZSTD); } break; default: diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 7d78c8f23561..58948af7f037 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -205,7 +205,7 @@ unique_ptr ReadCSVInitLocal(ExecutionContext &context, return nullptr; } auto &global_state = global_state_p->Cast(); - if (global_state.current_boundary.done) { + if (global_state.IsDone()) { // nothing to do return nullptr; } @@ -390,9 +390,9 @@ unique_ptr ReadCSVReplacement(ClientContext &context, ReplacementScanI auto table_name = ReplacementScan::GetFullPath(input); auto lower_name = StringUtil::Lower(table_name); // remove any compression - if (StringUtil::EndsWith(lower_name, ".gz")) { + if (StringUtil::EndsWith(lower_name, CompressionExtensionFromType(FileCompressionType::GZIP))) { lower_name = lower_name.substr(0, lower_name.size() - 3); - } else if (StringUtil::EndsWith(lower_name, ".zst")) { + } else if (StringUtil::EndsWith(lower_name, CompressionExtensionFromType(FileCompressionType::ZSTD))) { if (!Catalog::TryAutoLoad(context, "parquet")) { throw MissingExtensionException("parquet extension is required for reading zst compressed file"); } diff --git a/src/function/table/sniff_csv.cpp b/src/function/table/sniff_csv.cpp index 0df333517afd..11e5cca83a20 100644 --- a/src/function/table/sniff_csv.cpp +++ b/src/function/table/sniff_csv.cpp @@ -110,18 +110,20 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p, const CSVSniffFunctionData &data = data_p.bind_data->Cast(); auto &fs = duckdb::FileSystem::GetFileSystem(context); - if (data.path.rfind("http://", 0) != 0 && data.path.rfind("https://", 0) != 0 && fs.HasGlob(data.path)) { - throw NotImplementedException("sniff_csv does not operate on globs yet"); + auto paths = fs.GlobFiles(data.path, context, FileGlobOptions::DISALLOW_EMPTY); + if (paths.size() > 1) { + throw NotImplementedException("sniff_csv does not operate on more than one file yet"); } // We must run the sniffer. 
auto sniffer_options = data.options; - sniffer_options.file_path = data.path; + sniffer_options.file_path = paths[0]; auto buffer_manager = make_shared_ptr(context, sniffer_options, sniffer_options.file_path, 0); if (sniffer_options.name_list.empty()) { sniffer_options.name_list = data.names_csv; } + if (sniffer_options.sql_type_list.empty()) { sniffer_options.sql_type_list = data.return_types_csv; } @@ -204,7 +206,7 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p, std::ostringstream csv_read; // Base, Path and auto_detect=false - csv_read << "FROM read_csv('" << data.path << "'" << separator << "auto_detect=false" << separator; + csv_read << "FROM read_csv('" << paths[0] << "'" << separator << "auto_detect=false" << separator; // 10.1. Delimiter if (!sniffer_options.dialect_options.state_machine_options.delimiter.IsSetByUser()) { csv_read << "delim=" diff --git a/src/function/table/table_scan.cpp b/src/function/table/table_scan.cpp index 938b443d49f9..7733bf2b42cf 100644 --- a/src/function/table/table_scan.cpp +++ b/src/function/table/table_scan.cpp @@ -86,7 +86,6 @@ static unique_ptr TableScanInitLocal(ExecutionContext & } unique_ptr TableScanInitGlobal(ClientContext &context, TableFunctionInitInput &input) { - D_ASSERT(input.bind_data); auto &bind_data = input.bind_data->Cast(); auto result = make_uniq(context, input.bind_data.get()); diff --git a/src/include/duckdb.h b/src/include/duckdb.h index 51878906b60b..b66253fcc689 100644 --- a/src/include/duckdb.h +++ b/src/include/duckdb.h @@ -16,11 +16,15 @@ //! duplicate of duckdb/main/winapi.hpp #ifndef DUCKDB_API #ifdef _WIN32 +#ifdef DUCKDB_STATIC_BUILD +#define DUCKDB_API +#else #if defined(DUCKDB_BUILD_LIBRARY) && !defined(DUCKDB_BUILD_LOADABLE_EXTENSION) #define DUCKDB_API __declspec(dllexport) #else #define DUCKDB_API __declspec(dllimport) #endif +#endif #else #define DUCKDB_API #endif @@ -29,11 +33,15 @@ //! duplicate of duckdb/main/winapi.hpp #ifndef DUCKDB_EXTENSION_API #ifdef _WIN32 +#ifdef DUCKDB_STATIC_BUILD +#define DUCKDB_EXTENSION_API +#else #ifdef DUCKDB_BUILD_LOADABLE_EXTENSION #define DUCKDB_EXTENSION_API __declspec(dllexport) #else #define DUCKDB_EXTENSION_API #endif +#endif #else #define DUCKDB_EXTENSION_API __attribute__((visibility("default"))) #endif @@ -1127,8 +1135,8 @@ DUCKDB_API duckdb_timestamp duckdb_value_timestamp(duckdb_result *result, idx_t DUCKDB_API duckdb_interval duckdb_value_interval(duckdb_result *result, idx_t col, idx_t row); /*! -**DEPRECATION NOTICE**: use duckdb_value_string instead. This function does not work correctly if the string contains -null bytes. +**DEPRECATED**: Use duckdb_value_string instead. This function does not work correctly if the string contains null +bytes. * @return The text value at the specified location as a null-terminated string, or nullptr if the value cannot be converted. The result must be freed with `duckdb_free`. @@ -1146,8 +1154,8 @@ The resulting field "string.data" must be freed with `duckdb_free.` DUCKDB_API duckdb_string duckdb_value_string(duckdb_result *result, idx_t col, idx_t row); /*! -**DEPRECATION NOTICE**: use duckdb_value_string_internal instead. This function does not work correctly if the string -contains null bytes. +**DEPRECATED**: Use duckdb_value_string_internal instead. This function does not work correctly if the string contains +null bytes. * @return The char* value at the specified location. ONLY works on VARCHAR columns and does not auto-cast. 
If the column is NOT a VARCHAR column this function will return NULL. @@ -1157,8 +1165,8 @@ The result must NOT be freed. DUCKDB_API char *duckdb_value_varchar_internal(duckdb_result *result, idx_t col, idx_t row); /*! -**DEPRECATION NOTICE**: use duckdb_value_string_internal instead. This function does not work correctly if the string -contains null bytes. +**DEPRECATED**: Use duckdb_value_string_internal instead. This function does not work correctly if the string contains +null bytes. * @return The char* value at the specified location. ONLY works on VARCHAR columns and does not auto-cast. If the column is NOT a VARCHAR column this function will return NULL. @@ -2897,8 +2905,9 @@ DUCKDB_API void duckdb_destroy_scalar_function_set(duckdb_scalar_function_set *s /*! Adds the scalar function as a new overload to the scalar function set. -Returns DuckDBError if the function could not be added, for example if the overload already exists.* @param set The -scalar function set +Returns DuckDBError if the function could not be added, for example if the overload already exists. + +* @param set The scalar function set * @param function The function to add */ DUCKDB_API duckdb_state duckdb_add_scalar_function_to_set(duckdb_scalar_function_set set, @@ -3052,8 +3061,9 @@ DUCKDB_API void duckdb_destroy_aggregate_function_set(duckdb_aggregate_function_ /*! Adds the aggregate function as a new overload to the aggregate function set. -Returns DuckDBError if the function could not be added, for example if the overload already exists.* @param set The -aggregate function set +Returns DuckDBError if the function could not be added, for example if the overload already exists. + +* @param set The aggregate function set * @param function The function to add */ DUCKDB_API duckdb_state duckdb_add_aggregate_function_to_set(duckdb_aggregate_function_set set, diff --git a/src/include/duckdb/common/arrow/appender/scalar_data.hpp b/src/include/duckdb/common/arrow/appender/scalar_data.hpp index efb4e81415d1..00326a6a55c0 100644 --- a/src/include/duckdb/common/arrow/appender/scalar_data.hpp +++ b/src/include/duckdb/common/arrow/appender/scalar_data.hpp @@ -3,6 +3,8 @@ #include "duckdb/common/arrow/appender/append_data.hpp" #include "duckdb/function/table/arrow.hpp" +#include "duckdb/common/bswap.hpp" + namespace duckdb { @@ -57,6 +59,25 @@ struct ArrowTimeTzConverter { } }; +struct ArrowUUIDBlobConverter { + template + static TGT Operation(hugeint_t input) { + // Turn into big-endian + auto upper = BSwap(input.lower); + // flip the upper MSB + auto lower = BSwap(static_cast(static_cast(input.upper) ^ (static_cast(1) << 63))); + return {static_cast(upper), static_cast(lower)}; + } + + static bool SkipNulls() { + return true; + } + + template + static void SetNull(TGT &value) { + } +}; + template struct ArrowScalarBaseData { static void Append(ArrowAppendData &append_data, Vector &input, idx_t from, idx_t to, idx_t input_size) { diff --git a/src/include/duckdb/common/arrow/appender/varchar_data.hpp b/src/include/duckdb/common/arrow/appender/varchar_data.hpp index a12d6c6145e2..12f68e4315d4 100644 --- a/src/include/duckdb/common/arrow/appender/varchar_data.hpp +++ b/src/include/duckdb/common/arrow/appender/varchar_data.hpp @@ -3,6 +3,7 @@ #include "duckdb/common/arrow/appender/append_data.hpp" #include "duckdb/common/arrow/appender/scalar_data.hpp" #include "duckdb/common/types/arrow_string_view_type.hpp" +#include "duckdb/common/types/uuid.hpp"
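[Note] The UUID converters in these appender headers, like UUIDConversion in arrow_conversion.cpp, hinge on one representation detail: DuckDB stores a UUID as a signed 128-bit value whose most significant bit is flipped so that signed ordering matches unsigned RFC 4122 ordering, while Arrow expects the raw big-endian bytes. A small standalone sketch of the flip-and-byte-swap round trip on one 64-bit half (illustrative only; __builtin_bswap64 stands in for DuckDB's BSwap and assumes GCC/Clang, and the actual converters additionally swap which half comes first):

```cpp
#include <cassert>
#include <cstdint>

// Exporting un-flips the MSB and converts to big-endian; importing reverses
// both steps, so a round trip is the identity.
static uint64_t ToArrowHalf(uint64_t internal_upper) {
	return __builtin_bswap64(internal_upper ^ (1ULL << 63));
}

static uint64_t FromArrowHalf(uint64_t exported) {
	return __builtin_bswap64(exported) ^ (1ULL << 63);
}

int main() {
	const uint64_t internal_upper = 0x0123456789abcdefULL;
	assert(FromArrowHalf(ToArrowHalf(internal_upper)) == internal_upper);
	return 0;
}
```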
namespace duckdb { @@ -21,6 +22,18 @@ struct ArrowVarcharConverter { } }; +struct ArrowUUIDConverter { + template + static idx_t GetLength(SRC input) { + return UUID::STRING_SIZE; + } + + template + static void WriteData(data_ptr_t target, SRC input) { + UUID::ToString(input, char_ptr_cast(target)); + } +}; + template struct ArrowVarcharData { static void Initialize(ArrowAppendData &result, const LogicalType &type, idx_t capacity) { diff --git a/src/include/duckdb/common/enums/file_compression_type.hpp b/src/include/duckdb/common/enums/file_compression_type.hpp index 98fe5e75c532..77b66b3920d4 100644 --- a/src/include/duckdb/common/enums/file_compression_type.hpp +++ b/src/include/duckdb/common/enums/file_compression_type.hpp @@ -16,4 +16,8 @@ enum class FileCompressionType : uint8_t { AUTO_DETECT = 0, UNCOMPRESSED = 1, GZ FileCompressionType FileCompressionTypeFromString(const string &input); +string CompressionExtensionFromType(const FileCompressionType type); + +bool IsFileCompressed(string path, FileCompressionType type); + } // namespace duckdb diff --git a/src/include/duckdb/common/types/row/tuple_data_collection.hpp b/src/include/duckdb/common/types/row/tuple_data_collection.hpp index 1612287d5702..b87b4002401f 100644 --- a/src/include/duckdb/common/types/row/tuple_data_collection.hpp +++ b/src/include/duckdb/common/types/row/tuple_data_collection.hpp @@ -114,6 +114,8 @@ class TupleDataCollection { static void ToUnifiedFormat(TupleDataChunkState &chunk_state, DataChunk &new_chunk); //! Gets the UnifiedVectorFormat from the Chunk state as an array static void GetVectorData(const TupleDataChunkState &chunk_state, UnifiedVectorFormat result[]); + //! Resets the cached cast vectors (used for ARRAY/LIST casts) + static void ResetCachedCastVectors(TupleDataChunkState &chunk_state, const vector &column_ids); //! Computes the heap sizes for the new DataChunk that will be appended static void ComputeHeapSizes(TupleDataChunkState &chunk_state, const DataChunk &new_chunk, const SelectionVector &append_sel, const idx_t append_count); diff --git a/src/include/duckdb/common/types/varint.hpp b/src/include/duckdb/common/types/varint.hpp index 34ac7707ab70..023c2cfef07c 100644 --- a/src/include/duckdb/common/types/varint.hpp +++ b/src/include/duckdb/common/types/varint.hpp @@ -52,7 +52,7 @@ class Varint { //! Function to convert Varchar to VARINT blob DUCKDB_API static string VarcharToVarInt(const string_t &value); //! ----------------------------------- Double Cast ----------------------------------- // - DUCKDB_API static bool VarintToDouble(string_t &blob, double &result, bool &strict); + DUCKDB_API static bool VarintToDouble(const string_t &blob, double &result, bool &strict); }; //!
----------------------------------- (u)Integral Cast ----------------------------------- // diff --git a/src/include/duckdb/common/winapi.hpp b/src/include/duckdb/common/winapi.hpp index 1b77335ca9a8..6579a9ce7791 100644 --- a/src/include/duckdb/common/winapi.hpp +++ b/src/include/duckdb/common/winapi.hpp @@ -10,11 +10,15 @@ #ifndef DUCKDB_API #if defined(_WIN32) && !defined(__MINGW32__) +#ifdef DUCKDB_STATIC_BUILD +#define DUCKDB_API +#else #if defined(DUCKDB_BUILD_LIBRARY) && !defined(DUCKDB_BUILD_LOADABLE_EXTENSION) #define DUCKDB_API __declspec(dllexport) #else #define DUCKDB_API __declspec(dllimport) #endif +#endif #else #define DUCKDB_API #endif @@ -22,11 +26,15 @@ #ifndef DUCKDB_EXTENSION_API #ifdef _WIN32 +#ifdef DUCKDB_STATIC_BUILD +#define DUCKDB_EXTENSION_API +#else #ifdef DUCKDB_BUILD_LOADABLE_EXTENSION #define DUCKDB_EXTENSION_API __declspec(dllexport) #else #define DUCKDB_EXTENSION_API #endif +#endif #else #define DUCKDB_EXTENSION_API __attribute__((visibility("default"))) #endif diff --git a/src/include/duckdb/core_functions/array_kernels.hpp b/src/include/duckdb/core_functions/array_kernels.hpp new file mode 100644 index 000000000000..dd6e29153042 --- /dev/null +++ b/src/include/duckdb/core_functions/array_kernels.hpp @@ -0,0 +1,107 @@ +#pragma once +#include "duckdb/common/typedefs.hpp" +#include "duckdb/common/algorithm.hpp" +#include + +namespace duckdb { + +//------------------------------------------------------------------------- +// Folding Operations +//------------------------------------------------------------------------- +struct InnerProductOp { + static constexpr bool ALLOW_EMPTY = true; + + template + static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) { + + TYPE result = 0; + + auto lhs_ptr = lhs_data; + auto rhs_ptr = rhs_data; + + for (idx_t i = 0; i < count; i++) { + const auto x = *lhs_ptr++; + const auto y = *rhs_ptr++; + result += x * y; + } + + return result; + } +}; + +struct NegativeInnerProductOp { + static constexpr bool ALLOW_EMPTY = true; + + template + static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) { + return -InnerProductOp::Operation(lhs_data, rhs_data, count); + } +}; + +struct CosineSimilarityOp { + static constexpr bool ALLOW_EMPTY = false; + + template + static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) { + + TYPE distance = 0; + TYPE norm_l = 0; + TYPE norm_r = 0; + + auto l_ptr = lhs_data; + auto r_ptr = rhs_data; + + for (idx_t i = 0; i < count; i++) { + const auto x = *l_ptr++; + const auto y = *r_ptr++; + distance += x * y; + norm_l += x * x; + norm_r += y * y; + } + + auto similarity = distance / std::sqrt(norm_l * norm_r); + return std::max(static_cast(-1.0), std::min(similarity, static_cast(1.0))); + } +}; + +struct CosineDistanceOp { + static constexpr bool ALLOW_EMPTY = false; + + template + static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) { + return static_cast(1.0) - CosineSimilarityOp::Operation(lhs_data, rhs_data, count); + } +}; + +struct DistanceSquaredOp { + static constexpr bool ALLOW_EMPTY = true; + + template + static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) { + + TYPE distance = 0; + + auto l_ptr = lhs_data; + auto r_ptr = rhs_data; + + for (idx_t i = 0; i < count; i++) { + const auto x = *l_ptr++; + const auto y = *r_ptr++; + const auto diff = x - y; + distance += diff * diff; + } + + return distance; + } +}; + +struct DistanceOp { + static 
diff --git a/src/include/duckdb/core_functions/array_kernels.hpp b/src/include/duckdb/core_functions/array_kernels.hpp
new file mode 100644
index 000000000000..dd6e29153042
--- /dev/null
+++ b/src/include/duckdb/core_functions/array_kernels.hpp
@@ -0,0 +1,107 @@
+#pragma once
+#include "duckdb/common/typedefs.hpp"
+#include "duckdb/common/algorithm.hpp"
+#include <cmath>
+
+namespace duckdb {
+
+//-------------------------------------------------------------------------
+// Folding Operations
+//-------------------------------------------------------------------------
+struct InnerProductOp {
+	static constexpr bool ALLOW_EMPTY = true;
+
+	template <class TYPE>
+	static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) {
+
+		TYPE result = 0;
+
+		auto lhs_ptr = lhs_data;
+		auto rhs_ptr = rhs_data;
+
+		for (idx_t i = 0; i < count; i++) {
+			const auto x = *lhs_ptr++;
+			const auto y = *rhs_ptr++;
+			result += x * y;
+		}
+
+		return result;
+	}
+};
+
+struct NegativeInnerProductOp {
+	static constexpr bool ALLOW_EMPTY = true;
+
+	template <class TYPE>
+	static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) {
+		return -InnerProductOp::Operation(lhs_data, rhs_data, count);
+	}
+};
+
+struct CosineSimilarityOp {
+	static constexpr bool ALLOW_EMPTY = false;
+
+	template <class TYPE>
+	static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) {
+
+		TYPE distance = 0;
+		TYPE norm_l = 0;
+		TYPE norm_r = 0;
+
+		auto l_ptr = lhs_data;
+		auto r_ptr = rhs_data;
+
+		for (idx_t i = 0; i < count; i++) {
+			const auto x = *l_ptr++;
+			const auto y = *r_ptr++;
+			distance += x * y;
+			norm_l += x * x;
+			norm_r += y * y;
+		}
+
+		auto similarity = distance / std::sqrt(norm_l * norm_r);
+		return std::max(static_cast<TYPE>(-1.0), std::min(similarity, static_cast<TYPE>(1.0)));
+	}
+};
+
+struct CosineDistanceOp {
+	static constexpr bool ALLOW_EMPTY = false;
+
+	template <class TYPE>
+	static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) {
+		return static_cast<TYPE>(1.0) - CosineSimilarityOp::Operation(lhs_data, rhs_data, count);
+	}
+};
+
+struct DistanceSquaredOp {
+	static constexpr bool ALLOW_EMPTY = true;
+
+	template <class TYPE>
+	static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) {
+
+		TYPE distance = 0;
+
+		auto l_ptr = lhs_data;
+		auto r_ptr = rhs_data;
+
+		for (idx_t i = 0; i < count; i++) {
+			const auto x = *l_ptr++;
+			const auto y = *r_ptr++;
+			const auto diff = x - y;
+			distance += diff * diff;
+		}
+
+		return distance;
+	}
+};
+
+struct DistanceOp {
+	static constexpr bool ALLOW_EMPTY = true;
+
+	template <class TYPE>
+	static TYPE Operation(const TYPE *lhs_data, const TYPE *rhs_data, const idx_t count) {
+		return std::sqrt(DistanceSquaredOp::Operation(lhs_data, rhs_data, count));
+	}
+};
+
+} // namespace duckdb
diff --git a/src/include/duckdb/core_functions/scalar/array_functions.hpp b/src/include/duckdb/core_functions/scalar/array_functions.hpp
index 1cc09970219e..865d77bf645c 100644
--- a/src/include/duckdb/core_functions/scalar/array_functions.hpp
+++ b/src/include/duckdb/core_functions/scalar/array_functions.hpp
@@ -42,6 +42,15 @@ struct ArrayCosineSimilarityFun {
 	static ScalarFunctionSet GetFunctions();
 };
 
+struct ArrayCosineDistanceFun {
+	static constexpr const char *Name = "array_cosine_distance";
+	static constexpr const char *Parameters = "array1,array2";
+	static constexpr const char *Description = "Compute the cosine distance between two arrays of the same size. The array elements can not be NULL. The arrays can have any size as long as the size is the same for both arguments.";
+	static constexpr const char *Example = "array_cosine_distance([1, 2, 3], [1, 2, 3])";
+
+	static ScalarFunctionSet GetFunctions();
+};
+
 struct ArrayDistanceFun {
 	static constexpr const char *Name = "array_distance";
 	static constexpr const char *Parameters = "array1,array2";
@@ -66,4 +75,19 @@ struct ArrayDotProductFun {
 	static constexpr const char *Name = "array_dot_product";
 };
 
+struct ArrayNegativeInnerProductFun {
+	static constexpr const char *Name = "array_negative_inner_product";
+	static constexpr const char *Parameters = "array1,array2";
+	static constexpr const char *Description = "Compute the negative inner product between two arrays of the same size. The array elements can not be NULL. The arrays can have any size as long as the size is the same for both arguments.";
+	static constexpr const char *Example = "array_negative_inner_product([1, 2, 3], [1, 2, 3])";
+
+	static ScalarFunctionSet GetFunctions();
+};
+
+struct ArrayNegativeDotProductFun {
+	using ALIAS = ArrayNegativeInnerProductFun;
+
+	static constexpr const char *Name = "array_negative_dot_product";
+};
+
 } // namespace duckdb
diff --git a/src/include/duckdb/core_functions/scalar/list_functions.hpp b/src/include/duckdb/core_functions/scalar/list_functions.hpp
index 5c4b2faac32e..9386278acbc7 100644
--- a/src/include/duckdb/core_functions/scalar/list_functions.hpp
+++ b/src/include/duckdb/core_functions/scalar/list_functions.hpp
@@ -261,21 +261,30 @@ struct ListRangeFun {
 	static ScalarFunctionSet GetFunctions();
 };
 
-struct ListCosineSimilarityFun {
-	static constexpr const char *Name = "list_cosine_similarity";
+struct ListCosineDistanceFun {
+	static constexpr const char *Name = "list_cosine_distance";
 	static constexpr const char *Parameters = "list1,list2";
-	static constexpr const char *Description = "Compute the cosine similarity between two lists";
-	static constexpr const char *Example = "list_cosine_similarity([1, 2, 3], [1, 2, 3])";
+	static constexpr const char *Description = "Compute the cosine distance between two lists";
+	static constexpr const char *Example = "list_cosine_distance([1, 2, 3], [1, 2, 3])";
 
 	static ScalarFunctionSet GetFunctions();
 };
 
-struct ListCosineSimilarityFunAlias {
-	using ALIAS = ListCosineSimilarityFun;
+struct ListCosineDistanceFunAlias {
+	using ALIAS = ListCosineDistanceFun;
 
 	static constexpr const char *Name = "<=>";
 };
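Aside: every list_/array_ distance variant in these headers bottoms out in the array_kernels.hpp folding operations above, and the relationships are mechanical: the negative inner product negates InnerProductOp, cosine distance is one minus cosine similarity, and distance is the square root of the squared distance. A minimal sketch, assuming only the header path shown in the diff:

// For two identical vectors the inner product equals the squared L2 norm
// (14 here), the negative inner product is -14, and both cosine distance
// and Euclidean distance are 0.
#include "duckdb/core_functions/array_kernels.hpp"
#include <cstdio>

int main() {
	using namespace duckdb;
	const float a[3] = {1.0f, 2.0f, 3.0f};
	const float b[3] = {1.0f, 2.0f, 3.0f};
	std::printf("ip  = %f\n", InnerProductOp::Operation<float>(a, b, 3));
	std::printf("nip = %f\n", NegativeInnerProductOp::Operation<float>(a, b, 3));
	std::printf("cd  = %f\n", CosineDistanceOp::Operation<float>(a, b, 3));
	std::printf("d   = %f\n", DistanceOp::Operation<float>(a, b, 3));
	return 0;
}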
+
+struct ListCosineSimilarityFun {
+	static constexpr const char *Name = "list_cosine_similarity";
+	static constexpr const char *Parameters = "list1,list2";
+	static constexpr const char *Description = "Compute the cosine similarity between two lists";
+	static constexpr const char *Example = "list_cosine_similarity([1, 2, 3], [1, 2, 3])";
+
+	static ScalarFunctionSet GetFunctions();
+};
+
 struct ListDistanceFun {
 	static constexpr const char *Name = "list_distance";
 	static constexpr const char *Parameters = "list1,list2";
@@ -306,8 +315,23 @@ struct ListDotProductFun {
 	static constexpr const char *Name = "list_dot_product";
 };
 
-struct ListInnerProductFunAlias {
-	using ALIAS = ListInnerProductFun;
+struct ListNegativeInnerProductFun {
+	static constexpr const char *Name = "list_negative_inner_product";
+	static constexpr const char *Parameters = "list1,list2";
+	static constexpr const char *Description = "Compute the negative inner product between two lists";
+	static constexpr const char *Example = "list_negative_inner_product([1, 2, 3], [1, 2, 3])";
+
+	static ScalarFunctionSet GetFunctions();
+};
+
+struct ListNegativeDotProductFun {
+	using ALIAS = ListNegativeInnerProductFun;
+
+	static constexpr const char *Name = "list_negative_dot_product";
+};
+
+struct ListNegativeInnerProductFunAlias {
+	using ALIAS = ListNegativeInnerProductFun;
 
 	static constexpr const char *Name = "<#>";
 };
diff --git a/src/include/duckdb/execution/index/art/iterator.hpp b/src/include/duckdb/execution/index/art/iterator.hpp
index 161b65d0b1ff..58a0f106d54d 100644
--- a/src/include/duckdb/execution/index/art/iterator.hpp
+++ b/src/include/duckdb/execution/index/art/iterator.hpp
@@ -47,7 +47,7 @@ class IteratorKey {
 	//! Returns true, if key_bytes contains all bytes of key.
 	bool Contains(const ARTKey &key) const;
 	//! Returns true, if key_bytes is greater than [or equal to] the key.
-	bool GreaterThan(const ARTKey &key, bool equal) const;
+	bool GreaterThan(const ARTKey &key, const bool equal, const uint8_t nested_depth) const;
 
private:
 	unsafe_vector<uint8_t> key_bytes;
@@ -72,6 +72,11 @@ class Iterator {
 	//! bound exceeds the maximum value of the ART.
 	bool LowerBound(const Node &node, const ARTKey &key, const bool equal, idx_t depth);
 
+	//! Returns the nested depth.
+	uint8_t GetNestedDepth() const {
+		return nested_depth;
+	}
+
private:
 	//! The ART.
 	ART &art;
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
index 1b4a71f9f783..2c254e265292 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
@@ -65,7 +65,7 @@ class CSVError {
 	static CSVError LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info,
 	                              string &csv_row, idx_t byte_position, const string &current_path);
 	//! Produces an error message for a dialect sniffing error.
-	static CSVError DialectSniffingError(const CSVReaderOptions &options, const string &search_space);
+	static CSVError SniffingError(const CSVReaderOptions &options, const string &search_space);
 	//! Produces an error message for a header sniffing error.
 	static CSVError HeaderSniffingError(const CSVReaderOptions &options, const vector<HeaderValue> &best_header_row,
 	                                    idx_t column_count, char delimiter);
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
index 6290226de6e6..8b7cca02d593 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
@@ -40,13 +40,13 @@ struct CSVReaderOptions {
 	//===--------------------------------------------------------------------===//
 	//! See struct above.
 	DialectOptions dialect_options;
-	//! Whether or not we should ignore InvalidInput errors
+	//! Whether we should ignore InvalidInput errors
 	CSVOption<bool> ignore_errors = false;
 	//! Whether we store CSV Errors in the rejects table or not
 	CSVOption<bool> store_rejects = false;
 	//! Rejects table name (Name of the table the store rejects errors)
 	CSVOption<string> rejects_table_name = {"reject_errors"};
-	//! Rejects Scan name name (Name of the table the store rejects scans)
+	//! Rejects Scan name (Name of the table the store rejects scans)
 	CSVOption<string> rejects_scan_name = {"reject_scans"};
 	//! Rejects table entry limit (0 = no limit)
 	idx_t rejects_limit = 0;
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp
index 048dbbf2b1f5..13933a1805e9 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp
@@ -31,6 +31,10 @@ struct CSVStates {
 		       (states[1] == CSVState::RECORD_SEPARATOR || states[1] == CSVState::CARRIAGE_RETURN);
 	}
 
+	inline bool WasStandard() {
+		return states[0] == CSVState::STANDARD;
+	}
+
 	inline bool EmptyLastValue() {
 		// It is a new row, if the previous state is not a record separator, and the current one is
 		return states[0] == CSVState::DELIMITER &&
diff --git a/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp b/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp
index 5924ed753bfb..de2448d52687 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp
@@ -40,8 +40,8 @@ struct CSVGlobalState : public GlobalTableFunctionState {
 	//! Calculates the Max Threads that will be used by this CSV Reader
 	idx_t MaxThreads() const override;
 
-	//! We hold information on the current scanner boundary
-	CSVIterator current_boundary;
+
+	bool IsDone() const;
 
private:
 	//! Reference to the client context that created this scan
@@ -76,6 +76,8 @@ struct CSVGlobalState : public GlobalTableFunctionState {
 	shared_ptr<CSVBufferUsage> current_buffer_in_use;
 
 	unordered_map threads_per_file;
+	//! We hold information on the current scanner boundary
+	CSVIterator current_boundary;
 };
 
} // namespace duckdb
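Note on the CSVStates change above: WasStandard() reads a two-entry window in which states[0] holds the previous state and states[1] the state just entered. A simplified, self-contained illustration of that idiom (MiniCSVStates is ours, not the real class):

#include <cstdint>

enum class MiniCSVState : uint8_t { STANDARD, DELIMITER, RECORD_SEPARATOR, CARRIAGE_RETURN };

struct MiniCSVStates {
	MiniCSVState states[2] = {MiniCSVState::STANDARD, MiniCSVState::STANDARD};

	void Transition(MiniCSVState next) {
		states[0] = states[1]; // the current state becomes the previous one
		states[1] = next;
	}
	bool WasStandard() const {
		return states[0] == MiniCSVState::STANDARD; // look one transition back
	}
};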
diff --git a/src/include/duckdb/function/replacement_scan.hpp b/src/include/duckdb/function/replacement_scan.hpp
index edaf455e56b2..75ce069a846e 100644
--- a/src/include/duckdb/function/replacement_scan.hpp
+++ b/src/include/duckdb/function/replacement_scan.hpp
@@ -10,6 +10,7 @@
 
 #include "duckdb/common/common.hpp"
 #include "duckdb/common/string_util.hpp"
+#include "duckdb/common/enums/file_compression_type.hpp"
 
 namespace duckdb {
 
@@ -59,9 +60,9 @@ struct ReplacementScan {
 	static bool CanReplace(const string &table_name, const vector<string> &extensions) {
 		auto lower_name = StringUtil::Lower(table_name);
 
-		if (StringUtil::EndsWith(lower_name, ".gz")) {
+		if (StringUtil::EndsWith(lower_name, CompressionExtensionFromType(FileCompressionType::GZIP))) {
 			lower_name = lower_name.substr(0, lower_name.size() - 3);
-		} else if (StringUtil::EndsWith(lower_name, ".zst")) {
+		} else if (StringUtil::EndsWith(lower_name, CompressionExtensionFromType(FileCompressionType::ZSTD))) {
 			lower_name = lower_name.substr(0, lower_name.size() - 4);
 		}
diff --git a/src/include/duckdb/function/table_function.hpp b/src/include/duckdb/function/table_function.hpp
index 514dae50f98a..68883293bb6d 100644
--- a/src/include/duckdb/function/table_function.hpp
+++ b/src/include/duckdb/function/table_function.hpp
@@ -223,6 +223,9 @@
 typedef unique_ptr<FunctionData> (*table_function_deserialize_t)(Deserializer &deserializer, TableFunction &function);
 typedef void (*table_function_type_pushdown_t)(ClientContext &context, optional_ptr<FunctionData> bind_data,
                                                const unordered_map<idx_t, LogicalType> &new_column_types);
 
+//! When to call init_global to initialize the table function
+enum class TableFunctionInitialization { INITIALIZE_ON_EXECUTE, INITIALIZE_ON_SCHEDULE };
+
 class TableFunction : public SimpleNamedParameterFunction { // NOLINT: work-around bug in clang-tidy
public:
 	DUCKDB_API
@@ -301,6 +304,11 @@ class TableFunction : public SimpleNamedParameterFunction { // NOLINT: work-arou
 	//! Additional function info, passed to the bind
 	shared_ptr<TableFunctionInfo> function_info;
 
+	//! When to call init_global
+	//! By default init_global is called when the pipeline is ready for execution
+	//!
If this is set to `INITIALIZE_ON_SCHEDULE` the table function is initialized when the query is scheduled + TableFunctionInitialization global_initialization = TableFunctionInitialization::INITIALIZE_ON_EXECUTE; + DUCKDB_API bool Equal(const TableFunction &rhs) const; }; diff --git a/src/include/duckdb/main/capi/header_generation/functions/aggregate_functions.json b/src/include/duckdb/main/capi/header_generation/functions/aggregate_functions.json index 7f768f9435bb..5d0243f5ad0c 100644 --- a/src/include/duckdb/main/capi/header_generation/functions/aggregate_functions.json +++ b/src/include/duckdb/main/capi/header_generation/functions/aggregate_functions.json @@ -298,7 +298,7 @@ } ], "comment": { - "description": "Adds the aggregate function as a new overload to the aggregate function set.\n\nReturns DuckDBError if the function could not be added, for example if the overload already exists.", + "description": "Adds the aggregate function as a new overload to the aggregate function set.\n\nReturns DuckDBError if the function could not be added, for example if the overload already exists.\n\n", "param_comments": { "set": "The aggregate function set", "function": "The function to add" diff --git a/src/include/duckdb/main/capi/header_generation/functions/safe_fetch_functions.json b/src/include/duckdb/main/capi/header_generation/functions/safe_fetch_functions.json index d1f9a11e355d..c05ba2c9fcbc 100644 --- a/src/include/duckdb/main/capi/header_generation/functions/safe_fetch_functions.json +++ b/src/include/duckdb/main/capi/header_generation/functions/safe_fetch_functions.json @@ -417,7 +417,7 @@ } ], "comment": { - "description": "**DEPRECATION NOTICE**: use duckdb_value_string instead. This function does not work correctly if the string contains null bytes.\n\n", + "description": "**DEPRECATED**: Use duckdb_value_string instead. This function does not work correctly if the string contains null bytes.\n\n", "return_value": "The text value at the specified location as a null-terminated string, or nullptr if the value cannot be\nconverted. The result must be freed with `duckdb_free`." } }, @@ -461,7 +461,7 @@ } ], "comment": { - "description": "**DEPRECATION NOTICE**: use duckdb_value_string_internal instead. This function does not work correctly if the string contains\nnull bytes.\n\n", + "description": "**DEPRECATED**: Use duckdb_value_string_internal instead. This function does not work correctly if the string contains\nnull bytes.\n\n", "return_value": "The char* value at the specified location. ONLY works on VARCHAR columns and does not auto-cast.\nIf the column is NOT a VARCHAR column this function will return NULL.\n\nThe result must NOT be freed." } }, @@ -483,7 +483,7 @@ } ], "comment": { - "description": "**DEPRECATION NOTICE**: use duckdb_value_string_internal instead. This function does not work correctly if the string contains\nnull bytes.\n", + "description": "**DEPRECATED**: Use duckdb_value_string_internal instead. This function does not work correctly if the string contains\nnull bytes.\n", "return_value": "The char* value at the specified location. ONLY works on VARCHAR columns and does not auto-cast.\nIf the column is NOT a VARCHAR column this function will return NULL.\n\nThe result must NOT be freed." 
} }, diff --git a/src/include/duckdb/main/capi/header_generation/functions/scalar_functions.json b/src/include/duckdb/main/capi/header_generation/functions/scalar_functions.json index 3794bd28915e..3ef6fe8dfcab 100644 --- a/src/include/duckdb/main/capi/header_generation/functions/scalar_functions.json +++ b/src/include/duckdb/main/capi/header_generation/functions/scalar_functions.json @@ -295,7 +295,7 @@ } ], "comment": { - "description": "Adds the scalar function as a new overload to the scalar function set.\n\nReturns DuckDBError if the function could not be added, for example if the overload already exists.", + "description": "Adds the scalar function as a new overload to the scalar function set.\n\nReturns DuckDBError if the function could not be added, for example if the overload already exists.\n\n", "param_comments": { "set": "The scalar function set", "function": "The function to add" diff --git a/src/include/duckdb/main/capi/header_generation/header_base.hpp.template b/src/include/duckdb/main/capi/header_generation/header_base.hpp.template index 19a514b899a3..fb796c6b68f7 100644 --- a/src/include/duckdb/main/capi/header_generation/header_base.hpp.template +++ b/src/include/duckdb/main/capi/header_generation/header_base.hpp.template @@ -10,11 +10,15 @@ //! duplicate of duckdb/main/winapi.hpp #ifndef DUCKDB_API #ifdef _WIN32 +#ifdef DUCKDB_STATIC_BUILD +#define DUCKDB_API +#else #if defined(DUCKDB_BUILD_LIBRARY) && !defined(DUCKDB_BUILD_LOADABLE_EXTENSION) #define DUCKDB_API __declspec(dllexport) #else #define DUCKDB_API __declspec(dllimport) #endif +#endif #else #define DUCKDB_API #endif @@ -23,11 +27,15 @@ //! duplicate of duckdb/main/winapi.hpp #ifndef DUCKDB_EXTENSION_API #ifdef _WIN32 +#ifdef DUCKDB_STATIC_BUILD +#define DUCKDB_EXTENSION_API +#else #ifdef DUCKDB_BUILD_LOADABLE_EXTENSION #define DUCKDB_EXTENSION_API __declspec(dllexport) #else #define DUCKDB_EXTENSION_API #endif +#endif #else #define DUCKDB_EXTENSION_API __attribute__((visibility("default"))) #endif diff --git a/src/include/duckdb/main/config.hpp b/src/include/duckdb/main/config.hpp index b5bc7ac31416..400f3977c8bd 100644 --- a/src/include/duckdb/main/config.hpp +++ b/src/include/duckdb/main/config.hpp @@ -243,7 +243,9 @@ struct DBConfigOptions { //! Whether to print bindings when printing the plan (debug mode only) static bool debug_print_bindings; // NOLINT: debug setting //! The peak allocation threshold at which to flush the allocator after completing a task (1 << 27, ~128MB) - idx_t allocator_flush_threshold = 134217728; + idx_t allocator_flush_threshold = 134217728ULL; + //! If bulk deallocation larger than this occurs, flush outstanding allocations (1 << 30, ~1GB) + idx_t allocator_bulk_deallocation_flush_threshold = 1073741824ULL; //! Whether the allocator background thread is enabled bool allocator_background_threads = false; //! DuckDB API surface @@ -258,20 +260,18 @@ struct DBConfigOptions { bool abort_on_wal_failure = false; //! The index_scan_percentage sets a threshold for index scans. //! If fewer than MAX(index_scan_max_count, index_scan_percentage * total_row_count) - // rows match, we perform an index scan instead of a table scan. + //! rows match, we perform an index scan instead of a table scan. double index_scan_percentage = 0.001; //! The index_scan_max_count sets a threshold for index scans. //! If fewer than MAX(index_scan_max_count, index_scan_percentage * total_row_count) - // rows match, we perform an index scan instead of a table scan. + //! 
rows match, we perform an index scan instead of a table scan. idx_t index_scan_max_count = STANDARD_VECTOR_SIZE; - //! Whether or not we initialize table functions in the main thread - //! This is a work-around that exists for certain clients (specifically R) - //! Because those clients do not like it when threads other than the main thread call into R, for e.g., arrow scans - bool initialize_in_main_thread = false; //! The maximum number of schemas we will look through for "did you mean..." style errors in the catalog idx_t catalog_error_max_schemas = 100; //! Whether or not to always write to the WAL file, even if this is not required bool debug_skip_checkpoint_on_commit = false; + //! The maximum amount of vacuum tasks to schedule during a checkpoint + idx_t max_vacuum_tasks = 100; bool operator==(const DBConfigOptions &other) const; }; diff --git a/src/include/duckdb/main/extension_entries.hpp b/src/include/duckdb/main/extension_entries.hpp index 76e4aa70ccb7..7be9db0c057f 100644 --- a/src/include/duckdb/main/extension_entries.hpp +++ b/src/include/duckdb/main/extension_entries.hpp @@ -241,6 +241,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"postgres_scan", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"postgres_scan_pushdown", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"pragma_hnsw_index_info", "vss", CatalogType::TABLE_FUNCTION_ENTRY}, + {"pragma_rtree_index_info", "spatial", CatalogType::TABLE_FUNCTION_ENTRY}, {"read_json", "json", CatalogType::TABLE_FUNCTION_ENTRY}, {"read_json_auto", "json", CatalogType::TABLE_FUNCTION_ENTRY}, {"read_json_objects", "json", CatalogType::TABLE_FUNCTION_ENTRY}, @@ -251,6 +252,8 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"read_parquet", "parquet", CatalogType::TABLE_FUNCTION_ENTRY}, {"reduce_sql_statement", "sqlsmith", CatalogType::TABLE_FUNCTION_ENTRY}, {"row_to_json", "json", CatalogType::SCALAR_FUNCTION_ENTRY}, + {"rtree_index_dump", "spatial", CatalogType::TABLE_FUNCTION_ENTRY}, + {"rtree_index_scan", "spatial", CatalogType::TABLE_FUNCTION_ENTRY}, {"scan_arrow_ipc", "arrow", CatalogType::TABLE_FUNCTION_ENTRY}, {"shapefile_meta", "spatial", CatalogType::TABLE_FUNCTION_ENTRY}, {"sql_auto_complete", "autocomplete", CatalogType::TABLE_FUNCTION_ENTRY}, @@ -261,6 +264,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"st_area_spheroid", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_asgeojson", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_ashexwkb", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, + {"st_assvg", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_astext", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_aswkb", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_boundary", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, @@ -289,12 +293,14 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"st_envelope_agg", "spatial", CatalogType::AGGREGATE_FUNCTION_ENTRY}, {"st_equals", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_extent", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, + {"st_extent_approx", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_exteriorring", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_flipcoordinates", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_force2d", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_force3dm", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_force3dz", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY}, {"st_force4d", "spatial", 
CatalogType::SCALAR_FUNCTION_ENTRY},
+    {"st_generatepoints", "spatial", CatalogType::TABLE_FUNCTION_ENTRY},
     {"st_geometrytype", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY},
     {"st_geomfromgeojson", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY},
     {"st_geomfromhexewkb", "spatial", CatalogType::SCALAR_FUNCTION_ENTRY},
@@ -421,6 +427,7 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = {
     {"pg_connection_limit", "postgres_scanner"},
     {"pg_debug_show_queries", "postgres_scanner"},
     {"pg_experimental_filter_pushdown", "postgres_scanner"},
+    {"pg_null_byte_replacement", "postgres_scanner"},
     {"pg_pages_per_task", "postgres_scanner"},
     {"pg_use_binary_copy", "postgres_scanner"},
     {"pg_use_ctid_scan", "postgres_scanner"},
diff --git a/src/include/duckdb/main/extension_helper.hpp b/src/include/duckdb/main/extension_helper.hpp
index 0d27c160dbdf..d9e2b86ea1a8 100644
--- a/src/include/duckdb/main/extension_helper.hpp
+++ b/src/include/duckdb/main/extension_helper.hpp
@@ -76,6 +76,20 @@ struct ExtensionUpdateResult {
 	string installed_version;
 };
 
+struct ExtensionInstallOptions {
+	//! Install from a different repository than the default one
+	optional_ptr<ExtensionRepository> repository;
+	//! Install a specific version of the extension
+	string version;
+
+	//! Overwrite existing installation
+	bool force_install = false;
+	//! Use etags to avoid downloading unchanged extension files
+	bool use_etags = false;
+	//! Throw an error when installing an extension with a different origin than the one that is installed
+	bool throw_on_origin_mismatch = false;
+};
+
 class ExtensionHelper {
public:
 	static void LoadAllExtensions(DuckDB &db);
@@ -84,15 +98,9 @@ class ExtensionHelper {
 	//! Install an extension
 	static unique_ptr<ExtensionInstallInfo> InstallExtension(ClientContext &context, const string &extension,
-	                                                         bool force_install,
-	                                                         optional_ptr<ExtensionRepository> repository = nullptr,
-	                                                         bool throw_on_origin_mismatch = false,
-	                                                         const string &version = "");
+	                                                         ExtensionInstallOptions &options);
 	static unique_ptr<ExtensionInstallInfo> InstallExtension(DatabaseInstance &db, FileSystem &fs,
-	                                                         const string &extension, bool force_install,
-	                                                         optional_ptr<ExtensionRepository> repository = nullptr,
-	                                                         bool throw_on_origin_mismatch = false,
-	                                                         const string &version = "");
+	                                                         const string &extension, ExtensionInstallOptions &options);
 	//! Load an extension
 	static void LoadExternalExtension(ClientContext &context, const string &extension);
 	static void LoadExternalExtension(DatabaseInstance &db, FileSystem &fs, const string &extension);
@@ -212,10 +220,11 @@ class ExtensionHelper {
 	static bool CreateSuggestions(const string &extension_name, string &message);
 
private:
-	static unique_ptr<ExtensionInstallInfo> InstallExtensionInternal(
-	    DatabaseInstance &db, FileSystem &fs, const string &local_path, const string &extension, bool force_install,
-	    bool throw_on_origin_mismatch, const string &version, optional_ptr<ExtensionRepository> repository,
-	    optional_ptr<HTTPLogger> http_logger = nullptr, optional_ptr<ClientContext> context = nullptr);
+	static unique_ptr<ExtensionInstallInfo> InstallExtensionInternal(DatabaseInstance &db, FileSystem &fs,
+	                                                                 const string &local_path, const string &extension,
+	                                                                 ExtensionInstallOptions &options,
+	                                                                 optional_ptr<HTTPLogger> http_logger = nullptr,
+	                                                                 optional_ptr<ClientContext> context = nullptr);
 	static const vector<string> PathComponents();
 	static string DefaultExtensionFolder(FileSystem &fs);
 	static bool AllowAutoInstall(const string &extension);
diff --git a/src/include/duckdb/main/settings.hpp b/src/include/duckdb/main/settings.hpp
index d63c128d84b9..538a75b85e4f 100644
--- a/src/include/duckdb/main/settings.hpp
+++ b/src/include/duckdb/main/settings.hpp
@@ -603,6 +603,15 @@ struct MaximumTempDirectorySize {
 	static Value GetSetting(const ClientContext &context);
 };
 
+struct MaximumVacuumTasks {
+	static constexpr const char *Name = "max_vacuum_tasks";
+	static constexpr const char *Description = "The maximum vacuum tasks to schedule during a checkpoint";
+	static constexpr const LogicalTypeId InputType = LogicalTypeId::UBIGINT;
+	static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &parameter);
+	static void ResetGlobal(DatabaseInstance *db, DBConfig &config);
+	static Value GetSetting(const ClientContext &context);
+};
+
 struct MergeJoinThreshold {
 	static constexpr const char *Name = "merge_join_threshold";
 	static constexpr const char *Description = "The number of rows we need on either table to choose a merge join";
@@ -877,7 +886,7 @@ struct UsernameSetting {
 	static Value GetSetting(const ClientContext &context);
 };
 
-struct FlushAllocatorSetting {
+struct AllocatorFlushThreshold {
 	static constexpr const char *Name = "allocator_flush_threshold";
 	static constexpr const char *Description =
 	    "Peak allocation threshold at which to flush the allocator after completing a task.";
@@ -887,6 +896,16 @@ struct FlushAllocatorSetting {
 	static Value GetSetting(const ClientContext &context);
 };
 
+struct AllocatorBulkDeallocationFlushThreshold {
+	static constexpr const char *Name = "allocator_bulk_deallocation_flush_threshold";
+	static constexpr const char *Description =
+	    "If a bulk deallocation larger than this occurs, flush outstanding allocations.";
+	static constexpr const LogicalTypeId InputType = LogicalTypeId::VARCHAR;
+	static void SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &parameter);
+	static void ResetGlobal(DatabaseInstance *db, DBConfig &config);
+	static Value GetSetting(const ClientContext &context);
+};
+
 struct AllocatorBackgroundThreadsSetting {
 	static constexpr const char *Name = "allocator_background_threads";
 	static constexpr const char *Description = "Whether to enable the allocator background thread.";
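Note: the net effect of the ExtensionInstallOptions refactor above is that flags which used to be positional parameters now travel in a single struct. A hedged sketch of the new call shape (the wrapper function and choice of extension are ours):

#include "duckdb/main/extension_helper.hpp"

namespace duckdb {
// Force-reinstall an extension; only the fields shown are set, the rest keep
// the defaults declared in extension_helper.hpp.
static void ForceInstallJson(ClientContext &context) {
	ExtensionInstallOptions options;
	options.force_install = true; // overwrite an existing installation
	options.use_etags = true;     // skip the download when the ETag is unchanged
	ExtensionHelper::InstallExtension(context, "json", options);
}
} // namespace duckdb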
diff --git a/src/include/duckdb/parser/transformer.hpp b/src/include/duckdb/parser/transformer.hpp
index 0646660ce7e9..760526b15fa7 100644
--- a/src/include/duckdb/parser/transformer.hpp
+++ b/src/include/duckdb/parser/transformer.hpp
@@ -112,9 +112,8 @@ class Transformer {
 	// Statement transformation
 	//===--------------------------------------------------------------------===//
 	//! Transform a Postgres duckdb_libpgquery::T_PGSelectStmt node into a SelectStatement
-	unique_ptr<SelectStatement> TransformSelect(optional_ptr<duckdb_libpgquery::PGNode> node, bool is_select = true);
-	//! Transform a Postgres duckdb_libpgquery::T_PGSelectStmt node into a SelectStatement
-	unique_ptr<SelectStatement> TransformSelect(duckdb_libpgquery::PGSelectStmt &select, bool is_select = true);
+	unique_ptr<SelectStatement> TransformSelectStmt(duckdb_libpgquery::PGSelectStmt &select, bool is_select = true);
+	unique_ptr<SelectStatement> TransformSelectStmt(duckdb_libpgquery::PGNode &node, bool is_select = true);
 	//! Transform a Postgres T_AlterStmt node into a AlterStatement
 	unique_ptr<AlterStatement> TransformAlter(duckdb_libpgquery::PGAlterTableStmt &stmt);
 	//! Transform a Postgres duckdb_libpgquery::T_PGRenameStmt node into a RenameStatement
@@ -170,8 +169,10 @@ class Transformer {
 	unique_ptr TransformImport(duckdb_libpgquery::PGImportStmt &stmt);
 	unique_ptr TransformExplain(duckdb_libpgquery::PGExplainStmt &stmt);
 	unique_ptr TransformVacuum(duckdb_libpgquery::PGVacuumStmt &stmt);
-	unique_ptr TransformShow(duckdb_libpgquery::PGVariableShowStmt &stmt);
-	unique_ptr TransformShowSelect(duckdb_libpgquery::PGVariableShowSelectStmt &stmt);
+	unique_ptr TransformShow(duckdb_libpgquery::PGVariableShowStmt &stmt);
+	unique_ptr TransformShowStmt(duckdb_libpgquery::PGVariableShowStmt &stmt);
+	unique_ptr TransformShowSelect(duckdb_libpgquery::PGVariableShowSelectStmt &stmt);
+	unique_ptr TransformShowSelectStmt(duckdb_libpgquery::PGVariableShowSelectStmt &stmt);
 	unique_ptr TransformAttach(duckdb_libpgquery::PGAttachStmt &stmt);
 	unique_ptr TransformDetach(duckdb_libpgquery::PGDetachStmt &stmt);
 	unique_ptr TransformUse(duckdb_libpgquery::PGUseStmt &stmt);
@@ -204,7 +205,8 @@ class Transformer {
 	// Query Node Transform
 	//===--------------------------------------------------------------------===//
 	//! Transform a Postgres duckdb_libpgquery::T_PGSelectStmt node into a QueryNode
-	unique_ptr<QueryNode> TransformSelectNode(duckdb_libpgquery::PGSelectStmt &select);
+	unique_ptr<QueryNode> TransformSelectNode(duckdb_libpgquery::PGNode &select, bool is_select = true);
+	unique_ptr<QueryNode> TransformSelectNodeInternal(duckdb_libpgquery::PGSelectStmt &select, bool is_select = true);
 	unique_ptr<QueryNode> TransformSelectInternal(duckdb_libpgquery::PGSelectStmt &select);
 	void TransformModifiers(duckdb_libpgquery::PGSelectStmt &stmt, QueryNode &node);
diff --git a/src/include/duckdb/planner/expression_binder.hpp b/src/include/duckdb/planner/expression_binder.hpp
index c454be641482..eded041bc8e1 100644
--- a/src/include/duckdb/planner/expression_binder.hpp
+++ b/src/include/duckdb/planner/expression_binder.hpp
@@ -154,8 +154,6 @@ class ExpressionBinder {
 	static LogicalType GetExpressionReturnType(const Expression &expr);
 
private:
-	//! Maximum stack depth
-	static constexpr const idx_t MAXIMUM_STACK_DEPTH = 128;
 	//! Current stack depth
 	idx_t stack_depth = DConstants::INVALID_INDEX;
 
@@ -204,6 +202,8 @@ class ExpressionBinder {
 	virtual BindResult BindUnnest(FunctionExpression &expr, idx_t depth, bool root_expression);
 	virtual BindResult BindMacro(FunctionExpression &expr, ScalarMacroCatalogEntry &macro, idx_t depth,
 	                             unique_ptr<ParsedExpression> &expr_ptr);
+	void UnfoldMacroExpression(FunctionExpression &function, ScalarMacroCatalogEntry &macro_func,
+	                           unique_ptr<ParsedExpression> &expr);
 
 	virtual string UnsupportedAggregateMessage();
 	virtual string UnsupportedUnnestMessage();
diff --git a/src/include/duckdb/planner/logical_operator.hpp b/src/include/duckdb/planner/logical_operator.hpp
index ffdc898f6367..a6cc867e1508 100644
--- a/src/include/duckdb/planner/logical_operator.hpp
+++ b/src/include/duckdb/planner/logical_operator.hpp
@@ -65,6 +65,7 @@ class LogicalOperator {
 	void AddChild(unique_ptr<LogicalOperator> child);
 	virtual idx_t EstimateCardinality(ClientContext &context);
 	void SetEstimatedCardinality(idx_t _estimated_cardinality);
+	void SetParamsEstimatedCardinality(InsertionOrderPreservingMap<string> &result) const;
 
 	virtual void Serialize(Serializer &serializer) const;
 	static unique_ptr<LogicalOperator> Deserialize(Deserializer &deserializer);
diff --git a/src/include/duckdb/storage/buffer/buffer_pool.hpp b/src/include/duckdb/storage/buffer/buffer_pool.hpp
index edc79c6c20c4..bf26f80b3d6c 100644
--- a/src/include/duckdb/storage/buffer/buffer_pool.hpp
+++ b/src/include/duckdb/storage/buffer/buffer_pool.hpp
@@ -48,6 +48,9 @@ class BufferPool {
 	//! blocks can be evicted
 	void SetLimit(idx_t limit, const char *exception_postscript);
 
+	//! If bulk deallocation larger than this occurs, flush outstanding allocations
+	void SetAllocatorBulkDeallocationFlushThreshold(idx_t threshold);
+
 	void UpdateUsedMemory(MemoryTag tag, int64_t size);
 
 	idx_t GetUsedMemory() const;
@@ -135,6 +138,8 @@ class BufferPool {
 	mutex limit_lock;
 	//! The maximum amount of memory that the buffer manager can keep (in bytes)
 	atomic<idx_t> maximum_memory;
+	//! If bulk deallocation larger than this occurs, flush outstanding allocations
+	atomic<idx_t> allocator_bulk_deallocation_flush_threshold;
 	//! Record timestamps of buffer manager unpin() events. Usable by custom eviction policies.
 	bool track_eviction_timestamps;
 	//! Eviction queues
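Note: both allocator thresholds are byte counts held in atomics; because the settings above declare a VARCHAR input type, human-readable sizes are accepted and parsed by DBConfig::ParseMemoryLimit. A hedged usage sketch through the public API (the values are arbitrary):

#include "duckdb.hpp"

int main() {
	duckdb::DuckDB db(nullptr);
	duckdb::Connection con(db);
	// flush the allocator after tasks whose peak allocation exceeded 64MB
	con.Query("SET allocator_flush_threshold = '64MB'");
	// flush outstanding allocations after a bulk deallocation larger than 512MB
	con.Query("SET allocator_bulk_deallocation_flush_threshold = '512MB'");
	return 0;
}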
diff --git a/src/include/duckdb/storage/checkpoint/table_data_writer.hpp b/src/include/duckdb/storage/checkpoint/table_data_writer.hpp
index dfcfec47af85..a606b5615197 100644
--- a/src/include/duckdb/storage/checkpoint/table_data_writer.hpp
+++ b/src/include/duckdb/storage/checkpoint/table_data_writer.hpp
@@ -38,6 +38,7 @@ class TableDataWriter {
 	virtual CheckpointType GetCheckpointType() const = 0;
 
 	TaskScheduler &GetScheduler();
+	DatabaseInstance &GetDatabase();
 
protected:
 	DuckTableEntry &table;
diff --git a/src/include/duckdb/storage/serialization/nodes.json b/src/include/duckdb/storage/serialization/nodes.json
index b503661651cb..c29cb8cb628a 100644
--- a/src/include/duckdb/storage/serialization/nodes.json
+++ b/src/include/duckdb/storage/serialization/nodes.json
@@ -802,7 +802,7 @@
     {"id": 137,
      "name": "dialect_options.state_machine_options.comment",
      "type": "CSVOption<char>",
-     "default": "CSVOption<char>()"
+     "default": "CSVOption<char>('\\0')"
    },
    {"id": 138,
     "name": "dialect_options.rows_until_header",
diff --git a/src/include/duckdb/storage/table/data_table_info.hpp b/src/include/duckdb/storage/table/data_table_info.hpp
index e7fa3bd3757a..fbc2e443b7c0 100644
--- a/src/include/duckdb/storage/table/data_table_info.hpp
+++ b/src/include/duckdb/storage/table/data_table_info.hpp
@@ -44,6 +44,9 @@ struct DataTableInfo {
 	const vector<IndexStorageInfo> &GetIndexStorageInfo() const {
 		return index_storage_infos;
 	}
+	unique_ptr<StorageLockKey> GetSharedLock() {
+		return checkpoint_lock.GetSharedLock();
+	}
 
 	string GetSchemaName();
 	string GetTableName();
diff --git a/src/include/duckdb/storage/table/scan_state.hpp b/src/include/duckdb/storage/table/scan_state.hpp
index 21cb7be2ec97..bd7d03771635 100644
--- a/src/include/duckdb/storage/table/scan_state.hpp
+++ b/src/include/duckdb/storage/table/scan_state.hpp
@@ -206,6 +206,15 @@ struct TableScanOptions {
 	bool force_fetch_row = false;
 };
 
+class CheckpointLock {
+public:
+	explicit CheckpointLock(unique_ptr<StorageLockKey> lock_p) : lock(std::move(lock_p)) {
+	}
+
+private:
+	unique_ptr<StorageLockKey> lock;
+};
+
 class TableScanState {
public:
 	TableScanState();
@@ -218,7 +227,7 @@ class TableScanState {
 	//! Options for scanning
 	TableScanOptions options;
 	//! Shared lock over the checkpoint to prevent checkpoints while reading
-	unique_ptr<StorageLockKey> checkpoint_lock;
+	shared_ptr<CheckpointLock> checkpoint_lock;
 	//! Filter info
 	ScanFilterInfo filters;
@@ -253,7 +262,7 @@ struct ParallelTableScanState {
 	//! Parallel scan state for the transaction-local state
 	ParallelCollectionScanState local_state;
 	//! Shared lock over the checkpoint to prevent checkpoints while reading
-	unique_ptr<StorageLockKey> checkpoint_lock;
+	shared_ptr<CheckpointLock> checkpoint_lock;
 };
 
 struct PrefetchState {
diff --git a/src/include/duckdb/transaction/duck_transaction.hpp b/src/include/duckdb/transaction/duck_transaction.hpp
index 63dbc0226348..efc236211a85 100644
--- a/src/include/duckdb/transaction/duck_transaction.hpp
+++ b/src/include/duckdb/transaction/duck_transaction.hpp
@@ -12,11 +12,13 @@
 #include "duckdb/common/reference_map.hpp"
 
 namespace duckdb {
+class CheckpointLock;
 class RowGroupCollection;
 class RowVersionManager;
 class DuckTransactionManager;
 class StorageLockKey;
 class StorageCommitState;
+struct DataTableInfo;
 struct UndoBufferProperties;
 
 class DuckTransaction : public Transaction {
@@ -79,6 +81,9 @@ class DuckTransaction : public Transaction {
 
 	void UpdateCollection(shared_ptr<RowGroupCollection> &collection);
 
+	//! Get a shared lock on a table
+	shared_ptr<CheckpointLock> SharedLockTable(DataTableInfo &info);
+
private:
 	DuckTransactionManager &transaction_manager;
 	//!
The undo buffer is used to store old versions of rows that are updated @@ -94,6 +99,10 @@ class DuckTransaction : public Transaction { reference_map_t> sequence_usage; //! Collections that are updated by this transaction reference_map_t> updated_collections; + //! Lock for the active_locks map + mutex active_locks_lock; + //! Active locks on tables + reference_map_t> active_locks; }; } // namespace duckdb diff --git a/src/main/buffered_data/simple_buffered_data.cpp b/src/main/buffered_data/simple_buffered_data.cpp index dee909849762..4b6a3a534177 100644 --- a/src/main/buffered_data/simple_buffered_data.cpp +++ b/src/main/buffered_data/simple_buffered_data.cpp @@ -53,7 +53,9 @@ StreamExecutionResult SimpleBufferedData::ExecuteTaskInternal(StreamQueryResult if (!cc) { return StreamExecutionResult::EXECUTION_CANCELLED; } - + if (!cc->IsActiveResult(context_lock, result)) { + return StreamExecutionResult::EXECUTION_CANCELLED; + } if (BufferIsFull()) { // The buffer isn't empty yet, just return return StreamExecutionResult::CHUNK_READY; diff --git a/src/main/config.cpp b/src/main/config.cpp index 454c00b2bab0..25af4eeed3a1 100644 --- a/src/main/config.cpp +++ b/src/main/config.cpp @@ -2,8 +2,8 @@ #include "duckdb/common/cgroups.hpp" #include "duckdb/common/file_system.hpp" -#include "duckdb/common/operator/multiply.hpp" #include "duckdb/common/operator/cast_operators.hpp" +#include "duckdb/common/operator/multiply.hpp" #include "duckdb/common/string_util.hpp" #include "duckdb/main/settings.hpp" #include "duckdb/storage/storage_extension.hpp" @@ -111,6 +111,7 @@ static const ConfigurationOption internal_options[] = { DUCKDB_LOCAL(StreamingBufferSize), DUCKDB_GLOBAL(MaximumMemorySetting), DUCKDB_GLOBAL(MaximumTempDirectorySize), + DUCKDB_GLOBAL(MaximumVacuumTasks), DUCKDB_LOCAL(MergeJoinThreshold), DUCKDB_LOCAL(NestedLoopJoinThreshold), DUCKDB_GLOBAL(OldImplicitCasting), @@ -144,7 +145,8 @@ static const ConfigurationOption internal_options[] = { DUCKDB_GLOBAL_ALIAS("user", UsernameSetting), DUCKDB_GLOBAL_ALIAS("wal_autocheckpoint", CheckpointThresholdSetting), DUCKDB_GLOBAL_ALIAS("worker_threads", ThreadsSetting), - DUCKDB_GLOBAL(FlushAllocatorSetting), + DUCKDB_GLOBAL(AllocatorFlushThreshold), + DUCKDB_GLOBAL(AllocatorBulkDeallocationFlushThreshold), DUCKDB_GLOBAL(AllocatorBackgroundThreadsSetting), DUCKDB_GLOBAL(DuckDBApiSetting), DUCKDB_GLOBAL(CustomUserAgentSetting), diff --git a/src/main/extension/extension_helper.cpp b/src/main/extension/extension_helper.cpp index c821caedeaa0..494832417e3f 100644 --- a/src/main/extension/extension_helper.cpp +++ b/src/main/extension/extension_helper.cpp @@ -213,7 +213,9 @@ bool ExtensionHelper::TryAutoLoadExtension(ClientContext &context, const string if (dbconfig.options.autoinstall_known_extensions) { auto &config = DBConfig::GetConfig(context); auto autoinstall_repo = ExtensionRepository::GetRepositoryByUrl(config.options.autoinstall_extension_repo); - ExtensionHelper::InstallExtension(context, extension_name, false, autoinstall_repo, false); + ExtensionInstallOptions options; + options.repository = autoinstall_repo; + ExtensionHelper::InstallExtension(context, extension_name, options); } ExtensionHelper::LoadExternalExtension(context, extension_name); return true; @@ -232,7 +234,9 @@ bool ExtensionHelper::TryAutoLoadExtension(DatabaseInstance &instance, const str if (dbconfig.options.autoinstall_known_extensions) { auto autoinstall_repo = ExtensionRepository::GetRepositoryByUrl(dbconfig.options.autoinstall_extension_repo); - 
ExtensionHelper::InstallExtension(instance, fs, extension_name, false, autoinstall_repo, false); + ExtensionInstallOptions options; + options.repository = autoinstall_repo; + ExtensionHelper::InstallExtension(instance, fs, extension_name, options); } ExtensionHelper::LoadExternalExtension(instance, fs, extension_name); return true; @@ -291,10 +295,15 @@ static ExtensionUpdateResult UpdateExtensionInternal(ClientContext &context, Dat auto repository_from_info = ExtensionRepository::GetRepositoryByUrl(extension_install_info->repository_url); result.repository = repository_from_info.ToReadableString(); - // We force install the full url found in this file, throwing + // Force install the full url found in this file, enabling etags to ensure efficient updating + ExtensionInstallOptions options; + options.repository = repository_from_info; + options.force_install = true; + options.use_etags = true; + unique_ptr install_result; try { - install_result = ExtensionHelper::InstallExtension(context, extension_name, true, repository_from_info); + install_result = ExtensionHelper::InstallExtension(context, extension_name, options); } catch (std::exception &e) { ErrorData error(e); error.Throw("Extension updating failed when trying to install '" + extension_name + "', original error: "); @@ -375,7 +384,9 @@ void ExtensionHelper::AutoLoadExtension(DatabaseInstance &db, const string &exte if (dbconfig.options.autoinstall_known_extensions) { //! Get the autoloading repository auto repository = ExtensionRepository::GetRepositoryByUrl(dbconfig.options.autoinstall_extension_repo); - ExtensionHelper::InstallExtension(db, *fs, extension_name, false, repository); + ExtensionInstallOptions options; + options.repository = repository; + ExtensionHelper::InstallExtension(db, *fs, extension_name, options); } #endif ExtensionHelper::LoadExternalExtension(db, *fs, extension_name); @@ -759,17 +770,6 @@ EMS5gLv50CzQqJXK9mNzPuYXNUIc4Pw4ssVWe0OfN3Od90gl5uFUwk/G9lWSYnBN static const char *const community_public_keys[] = { R"( -----BEGIN PUBLIC KEY----- -MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAv+Jki3aiZt0eOzShgD2g -BYPjPpkhHowOwPzUKtTVPob7vxyzd2wPyWDF/Zn6sN8QzravAdlXFE3SNF7ayO86 -IPHhMxO6P2YlxbipyKzPOUJsasXBiwYw2aSvb0RtwnYwD5lJs8Tz2ET1RQCFgXGc -LW7bDjKRbHSME0Me5rLRWVztOqULeoMeY1oCOmKKeAYxjFOASJJfQF9oQxkuu3j1 -qpcXnfHldlPGzFM77OFlWFtlc9QW4WNoxkO3HwskFW6ZRaQipM8vgSzkIfPFESGL -TtDRw+RcUPqmS6NVW8nhaiptBIMXy+9cP/l1LGmGwrZRhWP0YBlk6V9MUMzjyo+R -JQIDAQAB ------END PUBLIC KEY----- -)", - R"( ------BEGIN PUBLIC KEY----- MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAtXl28loGwAH3ZGQXXgJQ 3omhIEiUb3z9Petjl+jmdtEQnMNUFEZiXkfJB02UFWBL1OoKKnjiGhcr5oGiIZKR CoaL6SfmWe//7o8STM44stE0exzZcv8W4tWwjrzSWQnwh2JgSnHN64xoDQjdvG3X diff --git a/src/main/extension/extension_install.cpp b/src/main/extension/extension_install.cpp index 609de40c5eb7..4c21da0433c7 100644 --- a/src/main/extension/extension_install.cpp +++ b/src/main/extension/extension_install.cpp @@ -142,24 +142,18 @@ bool ExtensionHelper::CreateSuggestions(const string &extension_name, string &me } unique_ptr ExtensionHelper::InstallExtension(DatabaseInstance &db, FileSystem &fs, - const string &extension, bool force_install, - optional_ptr repository, - bool throw_on_origin_mismatch, - const string &version) { + const string &extension, + ExtensionInstallOptions &options) { #ifdef WASM_LOADABLE_EXTENSIONS // Install is currently a no-op return nullptr; #endif string local_path = ExtensionDirectory(db, fs); - return InstallExtensionInternal(db, fs, local_path, extension, force_install, 
throw_on_origin_mismatch, version, - repository); + return InstallExtensionInternal(db, fs, local_path, extension, options); } unique_ptr ExtensionHelper::InstallExtension(ClientContext &context, const string &extension, - bool force_install, - optional_ptr repository, - bool throw_on_origin_mismatch, - const string &version) { + ExtensionInstallOptions &options) { #ifdef WASM_LOADABLE_EXTENSIONS // Install is currently a no-op return nullptr; @@ -169,8 +163,7 @@ unique_ptr ExtensionHelper::InstallExtension(ClientContext string local_path = ExtensionDirectory(context); optional_ptr http_logger = ClientConfig::GetConfig(context).enable_http_logging ? context.client_data->http_logger.get() : nullptr; - return InstallExtensionInternal(db, fs, local_path, extension, force_install, throw_on_origin_mismatch, version, - repository, http_logger, context); + return InstallExtensionInternal(db, fs, local_path, extension, options, http_logger, context); } unsafe_unique_array ReadExtensionFileFromDisk(FileSystem &fs, const string &path, idx_t &file_size) { @@ -209,7 +202,7 @@ string ExtensionHelper::ExtensionUrlTemplate(optional_ptr DirectInstallExtension(DatabaseInstance &db, FileSystem &fs, const string &path, const string &temp_path, const string &extension_name, - const string &local_extension_path, bool force_install, - optional_ptr repository, + const string &local_extension_path, + ExtensionInstallOptions &options, optional_ptr context) { string file = fs.ConvertSeparators(path); @@ -290,7 +283,7 @@ static unique_ptr DirectInstallExtension(DatabaseInstance bool exists = fs.FileExists(file); // Recheck without .gz - if (!exists && StringUtil::EndsWith(file, ".gz")) { + if (!exists && StringUtil::EndsWith(file, CompressionExtensionFromType(FileCompressionType::GZIP))) { file = file.substr(0, file.size() - 3); exists = fs.FileExists(file); } @@ -325,13 +318,13 @@ static unique_ptr DirectInstallExtension(DatabaseInstance CheckExtensionMetadataOnInstall(db, extension_decompressed, extension_decompressed_size, info, extension_name); - if (!repository) { + if (!options.repository) { info.mode = ExtensionInstallMode::CUSTOM_PATH; info.full_path = file; } else { info.mode = ExtensionInstallMode::REPOSITORY; info.full_path = file; - info.repository_url = repository->path; + info.repository_url = options.repository->path; } WriteExtensionFiles(fs, temp_path, local_extension_path, extension_decompressed, extension_decompressed_size, info); @@ -342,8 +335,8 @@ static unique_ptr DirectInstallExtension(DatabaseInstance #ifndef DUCKDB_DISABLE_EXTENSION_LOAD static unique_ptr InstallFromHttpUrl(DatabaseInstance &db, const string &url, const string &extension_name, const string &temp_path, - const string &local_extension_path, bool force_install, - optional_ptr repository, + const string &local_extension_path, + ExtensionInstallOptions &options, optional_ptr http_logger) { string no_http = StringUtil::Replace(url, "http://", ""); @@ -391,7 +384,7 @@ static unique_ptr InstallFromHttpUrl(DatabaseInstance &db, duckdb_httplib::Headers headers = { {"User-Agent", StringUtil::Format("%s %s", db.config.UserAgent(), DuckDB::SourceID())}}; - if (!force_install && install_info && !install_info->etag.empty()) { + if (options.use_etags && install_info && !install_info->etag.empty()) { headers.insert({"If-None-Match", StringUtil::Format("%s", install_info->etag)}); } @@ -456,10 +449,10 @@ static unique_ptr InstallFromHttpUrl(DatabaseInstance &db, info.etag = res->get_header_value("ETag"); } - if (repository) { + if 
(options.repository) { info.mode = ExtensionInstallMode::REPOSITORY; info.full_path = url; - info.repository_url = repository->path; + info.repository_url = options.repository->path; } else { info.mode = ExtensionInstallMode::CUSTOM_PATH; info.full_path = url; @@ -473,24 +466,22 @@ static unique_ptr InstallFromHttpUrl(DatabaseInstance &db, } // Install an extension using a hand-rolled http request -static unique_ptr InstallFromRepository(DatabaseInstance &db, FileSystem &fs, const string &url, - const string &extension_name, - ExtensionRepository &repository, const string &temp_path, - const string &local_extension_path, const string &version, - bool force_install, optional_ptr http_logger, - optional_ptr context) { - string url_template = ExtensionHelper::ExtensionUrlTemplate(db, repository, version); +static unique_ptr +InstallFromRepository(DatabaseInstance &db, FileSystem &fs, const string &url, const string &extension_name, + const string &temp_path, const string &local_extension_path, ExtensionInstallOptions &options, + optional_ptr http_logger, optional_ptr context) { + string url_template = ExtensionHelper::ExtensionUrlTemplate(db, *options.repository, options.version); string generated_url = ExtensionHelper::ExtensionFinalizeUrlTemplate(url_template, extension_name); // Special handling for http repository: avoid using regular filesystem (note: the filesystem is not used here) - if (StringUtil::StartsWith(repository.path, "http://")) { - return InstallFromHttpUrl(db, generated_url, extension_name, temp_path, local_extension_path, force_install, - repository, http_logger); + if (StringUtil::StartsWith(options.repository->path, "http://")) { + return InstallFromHttpUrl(db, generated_url, extension_name, temp_path, local_extension_path, options, + http_logger); } // Default case, let the FileSystem figure it out - return DirectInstallExtension(db, fs, generated_url, temp_path, extension_name, local_extension_path, force_install, - repository, context); + return DirectInstallExtension(db, fs, generated_url, temp_path, extension_name, local_extension_path, options, + context); } static bool IsHTTP(const string &path) { @@ -526,8 +517,7 @@ static void ThrowErrorOnMismatchingExtensionOrigin(FileSystem &fs, const string unique_ptr ExtensionHelper::InstallExtensionInternal(DatabaseInstance &db, FileSystem &fs, const string &local_path, - const string &extension, bool force_install, bool throw_on_origin_mismatch, - const string &version, optional_ptr repository, + const string &extension, ExtensionInstallOptions &options, optional_ptr http_logger, optional_ptr context) { #ifdef DUCKDB_DISABLE_EXTENSION_LOAD throw PermissionException("Installing external extensions is disabled through a compile time flag"); @@ -540,11 +530,12 @@ ExtensionHelper::InstallExtensionInternal(DatabaseInstance &db, FileSystem &fs, string local_extension_path = fs.JoinPath(local_path, extension_name + ".duckdb_extension"); string temp_path = local_extension_path + ".tmp-" + UUID::ToString(UUID::GenerateRandomUUID()); - if (fs.FileExists(local_extension_path) && !force_install) { + if (fs.FileExists(local_extension_path) && !options.force_install) { // File exists: throw error if origin mismatches - if (throw_on_origin_mismatch && !db.config.options.allow_extensions_metadata_mismatch && + if (options.throw_on_origin_mismatch && !db.config.options.allow_extensions_metadata_mismatch && fs.FileExists(local_extension_path + ".info")) { - ThrowErrorOnMismatchingExtensionOrigin(fs, local_extension_path, extension_name, 
extension, repository); + ThrowErrorOnMismatchingExtensionOrigin(fs, local_extension_path, extension_name, extension, + options.repository); } // File exists, but that's okay, install is now a NOP @@ -555,29 +546,29 @@ ExtensionHelper::InstallExtensionInternal(DatabaseInstance &db, FileSystem &fs, fs.RemoveFile(temp_path); } - if (ExtensionHelper::IsFullPath(extension) && repository) { + if (ExtensionHelper::IsFullPath(extension) && options.repository) { throw InvalidInputException("Cannot pass both a repository and a full path url"); } // Resolve default repository if there is none set ExtensionRepository resolved_repository; - if (!ExtensionHelper::IsFullPath(extension) && !repository) { + if (!ExtensionHelper::IsFullPath(extension) && !options.repository) { resolved_repository = ExtensionRepository::GetDefaultRepository(db.config); - repository = resolved_repository; + options.repository = resolved_repository; } // Install extension from local, direct url if (ExtensionHelper::IsFullPath(extension) && !IsHTTP(extension)) { LocalFileSystem local_fs; - return DirectInstallExtension(db, local_fs, extension, temp_path, extension, local_extension_path, - force_install, nullptr, context); + return DirectInstallExtension(db, local_fs, extension, temp_path, extension, local_extension_path, options, + context); } // Install extension from local url based on a repository (Note that this will install it as a local file) - if (repository && !IsHTTP(repository->path)) { + if (options.repository && !IsHTTP(options.repository->path)) { LocalFileSystem local_fs; - return InstallFromRepository(db, fs, extension, extension_name, *repository, temp_path, local_extension_path, - version, force_install, http_logger, context); + return InstallFromRepository(db, fs, extension, extension_name, temp_path, local_extension_path, options, + http_logger, context); } #ifdef DISABLE_DUCKDB_REMOTE_INSTALL @@ -588,18 +579,17 @@ ExtensionHelper::InstallExtensionInternal(DatabaseInstance &db, FileSystem &fs, if (IsFullPath(extension)) { if (StringUtil::StartsWith(extension, "http://")) { // HTTP takes separate path to avoid dependency on httpfs extension - return InstallFromHttpUrl(db, extension, extension_name, temp_path, local_extension_path, force_install, - nullptr, http_logger); + return InstallFromHttpUrl(db, extension, extension_name, temp_path, local_extension_path, options, + http_logger); } // Direct installation from local or remote path - return DirectInstallExtension(db, fs, extension, temp_path, extension, local_extension_path, force_install, - nullptr, context); + return DirectInstallExtension(db, fs, extension, temp_path, extension, local_extension_path, options, context); } // Repository installation - return InstallFromRepository(db, fs, extension, extension_name, *repository, temp_path, local_extension_path, - version, force_install, http_logger, context); + return InstallFromRepository(db, fs, extension, extension_name, temp_path, local_extension_path, options, + http_logger, context); #endif #endif } diff --git a/src/main/extension/extension_load.cpp b/src/main/extension/extension_load.cpp index 7fe4fcb3db14..d7168ff9c4ea 100644 --- a/src/main/extension/extension_load.cpp +++ b/src/main/extension/extension_load.cpp @@ -453,7 +453,8 @@ ExtensionInitResult ExtensionHelper::InitialLoad(DatabaseInstance &db, FileSyste throw IOException(error); } // the extension load failed - try installing the extension - ExtensionHelper::InstallExtension(db, fs, extension, false); + ExtensionInstallOptions options; + 
ExtensionHelper::InstallExtension(db, fs, extension, options); // try loading again if (!TryInitialLoad(db, fs, extension, result, error)) { throw IOException(error); diff --git a/src/main/query_profiler.cpp b/src/main/query_profiler.cpp index 801496b15bd9..98b3bd108e49 100644 --- a/src/main/query_profiler.cpp +++ b/src/main/query_profiler.cpp @@ -684,11 +684,12 @@ void QueryProfiler::WriteToFile(const char *path, string &info) const { } } -profiler_settings_t ErasePhaseTimingSettings(profiler_settings_t settings) { +profiler_settings_t EraseQueryRootSettings(profiler_settings_t settings) { profiler_settings_t phase_timing_settings_to_erase; for (auto &setting : settings) { - if (MetricsUtils::IsOptimizerMetric(setting) || MetricsUtils::IsPhaseTimingMetric(setting)) { + if (MetricsUtils::IsOptimizerMetric(setting) || MetricsUtils::IsPhaseTimingMetric(setting) || + setting == MetricsType::BLOCKED_THREAD_TIME) { phase_timing_settings_to_erase.insert(setting); } } @@ -711,7 +712,7 @@ unique_ptr QueryProfiler::CreateTree(const PhysicalOperator &root info = ProfilingInfo(settings, depth); auto child_settings = settings; if (depth == 0) { - child_settings = ErasePhaseTimingSettings(child_settings); + child_settings = EraseQueryRootSettings(child_settings); } node->depth = depth; diff --git a/src/main/settings/settings.cpp b/src/main/settings/settings.cpp index 182daee5bdb2..8cdcf42efd2f 100644 --- a/src/main/settings/settings.cpp +++ b/src/main/settings/settings.cpp @@ -13,6 +13,7 @@ #include "duckdb/parallel/task_scheduler.hpp" #include "duckdb/parser/parser.hpp" #include "duckdb/planner/expression_binder.hpp" +#include "duckdb/storage/buffer/buffer_pool.hpp" #include "duckdb/storage/buffer_manager.hpp" #include "duckdb/storage/storage_manager.hpp" @@ -1343,6 +1344,22 @@ Value MaximumTempDirectorySize::GetSetting(const ClientContext &context) { } } +//===--------------------------------------------------------------------===// +// Maximum Vacuum Size +//===--------------------------------------------------------------------===// +void MaximumVacuumTasks::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) { + config.options.max_vacuum_tasks = input.GetValue(); +} + +void MaximumVacuumTasks::ResetGlobal(DatabaseInstance *db, DBConfig &config) { + config.options.max_vacuum_tasks = DBConfig().options.max_vacuum_tasks; +} + +Value MaximumVacuumTasks::GetSetting(const ClientContext &context) { + auto &config = DBConfig::GetConfig(context); + return Value::UBIGINT(config.options.max_vacuum_tasks); +} + //===--------------------------------------------------------------------===// // Merge Join Threshold //===--------------------------------------------------------------------===// @@ -1891,27 +1908,52 @@ Value UsernameSetting::GetSetting(const ClientContext &context) { //===--------------------------------------------------------------------===// // Allocator Flush Threshold //===--------------------------------------------------------------------===// -void FlushAllocatorSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) { +void AllocatorFlushThreshold::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) { config.options.allocator_flush_threshold = DBConfig::ParseMemoryLimit(input.ToString()); if (db) { TaskScheduler::GetScheduler(*db).SetAllocatorFlushTreshold(config.options.allocator_flush_threshold); } } -void FlushAllocatorSetting::ResetGlobal(DatabaseInstance *db, DBConfig &config) { +void 
AllocatorFlushThreshold::ResetGlobal(DatabaseInstance *db, DBConfig &config) { config.options.allocator_flush_threshold = DBConfig().options.allocator_flush_threshold; if (db) { TaskScheduler::GetScheduler(*db).SetAllocatorFlushTreshold(config.options.allocator_flush_threshold); } } -Value FlushAllocatorSetting::GetSetting(const ClientContext &context) { +Value AllocatorFlushThreshold::GetSetting(const ClientContext &context) { auto &config = DBConfig::GetConfig(context); return Value(StringUtil::BytesToHumanReadableString(config.options.allocator_flush_threshold)); } //===--------------------------------------------------------------------===// -// Allocator Background Thread +// Allocator Bulk Deallocation Flush Threshold +//===--------------------------------------------------------------------===// +void AllocatorBulkDeallocationFlushThreshold::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) { + config.options.allocator_bulk_deallocation_flush_threshold = DBConfig::ParseMemoryLimit(input.ToString()); + if (db) { + BufferManager::GetBufferManager(*db).GetBufferPool().SetAllocatorBulkDeallocationFlushThreshold( + config.options.allocator_bulk_deallocation_flush_threshold); + } +} + +void AllocatorBulkDeallocationFlushThreshold::ResetGlobal(DatabaseInstance *db, DBConfig &config) { + config.options.allocator_bulk_deallocation_flush_threshold = + DBConfig().options.allocator_bulk_deallocation_flush_threshold; + if (db) { + BufferManager::GetBufferManager(*db).GetBufferPool().SetAllocatorBulkDeallocationFlushThreshold( + config.options.allocator_bulk_deallocation_flush_threshold); + } +} + +Value AllocatorBulkDeallocationFlushThreshold::GetSetting(const ClientContext &context) { + auto &config = DBConfig::GetConfig(context); + return Value(StringUtil::BytesToHumanReadableString(config.options.allocator_bulk_deallocation_flush_threshold)); +} + +//===--------------------------------------------------------------------===// +// Allocator Background Threads //===--------------------------------------------------------------------===// void AllocatorBackgroundThreadsSetting::SetGlobal(DatabaseInstance *db, DBConfig &config, const Value &input) { config.options.allocator_background_threads = input.GetValue(); diff --git a/src/parallel/executor.cpp b/src/parallel/executor.cpp index 630b89c9313a..9655c33acf9c 100644 --- a/src/parallel/executor.cpp +++ b/src/parallel/executor.cpp @@ -2,6 +2,7 @@ #include "duckdb/execution/execution_context.hpp" #include "duckdb/execution/operator/helper/physical_result_collector.hpp" +#include "duckdb/execution/operator/scan/physical_table_scan.hpp" #include "duckdb/execution/operator/set/physical_cte.hpp" #include "duckdb/execution/operator/set/physical_recursive_cte.hpp" #include "duckdb/execution/physical_operator.hpp" @@ -162,11 +163,14 @@ void Executor::SchedulePipeline(const shared_ptr &meta_pipeline, S event_map.insert(make_pair(reference(*base_pipeline), base_stack)); for (auto &pipeline : pipelines) { - auto &config = DBConfig::GetConfig(context); auto source = pipeline->GetSource(); - if (source->type == PhysicalOperatorType::TABLE_SCAN && config.options.initialize_in_main_thread) { - // this is a work-around for the R client that requires the init to be called in the main thread - pipeline->ResetSource(true); + if (source->type == PhysicalOperatorType::TABLE_SCAN) { + auto &table_function = source->Cast(); + if (table_function.function.global_initialization == TableFunctionInitialization::INITIALIZE_ON_SCHEDULE) { + // certain 
functions have to be eagerly initialized during scheduling + // if that is the case - initialize the function here + pipeline->ResetSource(true); + } } } } diff --git a/src/parser/parsed_data/create_index_info.cpp b/src/parser/parsed_data/create_index_info.cpp index 3d41ea4fea2a..01e2840fad40 100644 --- a/src/parser/parsed_data/create_index_info.cpp +++ b/src/parser/parsed_data/create_index_info.cpp @@ -8,9 +8,9 @@ CreateIndexInfo::CreateIndexInfo() : CreateInfo(CatalogType::INDEX_ENTRY, INVALI } CreateIndexInfo::CreateIndexInfo(const duckdb::CreateIndexInfo &info) - : CreateInfo(CatalogType::INDEX_ENTRY), table(info.table), index_name(info.index_name), options(info.options), - index_type(info.index_type), constraint_type(info.constraint_type), column_ids(info.column_ids), - scan_types(info.scan_types), names(info.names) { + : CreateInfo(CatalogType::INDEX_ENTRY, info.schema), table(info.table), index_name(info.index_name), + options(info.options), index_type(info.index_type), constraint_type(info.constraint_type), + column_ids(info.column_ids), scan_types(info.scan_types), names(info.names) { } static void RemoveTableQualificationRecursive(unique_ptr &expr, const string &table_name) { diff --git a/src/parser/transform/expression/transform_subquery.cpp b/src/parser/transform/expression/transform_subquery.cpp index 2a06d5907e92..9d93b54851ee 100644 --- a/src/parser/transform/expression/transform_subquery.cpp +++ b/src/parser/transform/expression/transform_subquery.cpp @@ -24,7 +24,7 @@ void RemoveOrderQualificationRecursive(unique_ptr &expr) { unique_ptr Transformer::TransformSubquery(duckdb_libpgquery::PGSubLink &root) { auto subquery_expr = make_uniq(); - subquery_expr->subquery = TransformSelect(root.subselect); + subquery_expr->subquery = TransformSelectStmt(*root.subselect); SetQueryLocation(*subquery_expr, root.location); D_ASSERT(subquery_expr->subquery); D_ASSERT(!subquery_expr->subquery->node->GetSelectList().empty()); diff --git a/src/parser/transform/helpers/transform_cte.cpp b/src/parser/transform/helpers/transform_cte.cpp index 0e216471976e..a344d730e1a3 100644 --- a/src/parser/transform/helpers/transform_cte.cpp +++ b/src/parser/transform/helpers/transform_cte.cpp @@ -70,8 +70,7 @@ void Transformer::TransformCTE(duckdb_libpgquery::PGWithClause &de_with_clause, info->query = TransformRecursiveCTE(cte, *info); } else { Transformer cte_transformer(*this); - info->query = - cte_transformer.TransformSelect(*PGPointerCast(cte.ctequery)); + info->query = cte_transformer.TransformSelectStmt(*cte.ctequery); } D_ASSERT(info->query); auto cte_name = string(cte.ctename); @@ -114,16 +113,20 @@ unique_ptr Transformer::TransformRecursiveCTE(duckdb_libpgquery auto with_clause = PGPointerCast(stmt.withClause); TransformCTE(*with_clause, result.cte_map); } - result.left = TransformSelectNode(*PGPointerCast(stmt.larg)); - result.right = TransformSelectNode(*PGPointerCast(stmt.rarg)); + result.left = TransformSelectNode(*stmt.larg); + result.right = TransformSelectNode(*stmt.rarg); result.aliases = info.aliases; break; } case duckdb_libpgquery::PG_SETOP_EXCEPT: case duckdb_libpgquery::PG_SETOP_INTERSECT: - default: + default: { // This CTE is not recursive. Fallback to regular query transformation. 
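// [editor's note, not part of the patch] Stepping back to the executor.cpp hunk
// above: the global `initialize_in_main_thread` work-around for the R client is
// replaced by a per-table-function property. Each table function now declares how
// its global state must be initialized, roughly (the enum name and the
// INITIALIZE_ON_SCHEDULE enumerator appear in this patch; the other enumerator is
// an assumption):
//
//	enum class TableFunctionInitialization {
//		INITIALIZE_ON_EXECUTE, // default: lazily, on whichever thread runs the source
//		INITIALIZE_ON_SCHEDULE // eagerly, while SchedulePipeline runs on the scheduling thread
//	};
//
// SchedulePipeline then calls pipeline->ResetSource(true) only for sources whose
// function requests INITIALIZE_ON_SCHEDULE, instead of consulting a global config flag.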
- return TransformSelect(*PGPointerCast(cte.ctequery)); + auto node = TransformSelectNode(*cte.ctequery); + auto result = make_uniq(); + result->node = std::move(node); + return result; + } } if (stmt.limitCount || stmt.limitOffset) { diff --git a/src/parser/transform/statement/transform_copy.cpp b/src/parser/transform/statement/transform_copy.cpp index 5da207ddcac2..1fa2464f55be 100644 --- a/src/parser/transform/statement/transform_copy.cpp +++ b/src/parser/transform/statement/transform_copy.cpp @@ -117,7 +117,7 @@ unique_ptr Transformer::TransformCopy(duckdb_libpgquery::PGCopySt info.schema = table.schema_name; info.catalog = table.catalog_name; } else { - info.select_statement = TransformSelectNode(*PGPointerCast(stmt.query)); + info.select_statement = TransformSelectNode(*stmt.query); } // handle the different options of the COPY statement diff --git a/src/parser/transform/statement/transform_create_function.cpp b/src/parser/transform/statement/transform_create_function.cpp index fda65d8b1008..82917ee25d88 100644 --- a/src/parser/transform/statement/transform_create_function.cpp +++ b/src/parser/transform/statement/transform_create_function.cpp @@ -13,8 +13,7 @@ unique_ptr Transformer::TransformMacroFunction(duckdb_libpgquery: auto expression = TransformExpression(def.function); macro_func = make_uniq(std::move(expression)); } else if (def.query) { - auto query_node = - TransformSelect(*PGPointerCast(def.query), true)->node->Copy(); + auto query_node = TransformSelectNode(*def.query); macro_func = make_uniq(std::move(query_node)); } diff --git a/src/parser/transform/statement/transform_create_table_as.cpp b/src/parser/transform/statement/transform_create_table_as.cpp index 0c3406c27a92..6af7fe4e7373 100644 --- a/src/parser/transform/statement/transform_create_table_as.cpp +++ b/src/parser/transform/statement/transform_create_table_as.cpp @@ -15,7 +15,7 @@ unique_ptr Transformer::TransformCreateTableAs(duckdb_libpgquer if (stmt.query->type != duckdb_libpgquery::T_PGSelectStmt) { throw ParserException("CREATE TABLE AS requires a SELECT clause"); } - auto query = TransformSelect(stmt.query, false); + auto query = TransformSelectStmt(*stmt.query, false); auto result = make_uniq(); auto info = make_uniq(); diff --git a/src/parser/transform/statement/transform_create_type.cpp b/src/parser/transform/statement/transform_create_type.cpp index 3235ed3b3917..3b6753b6bf54 100644 --- a/src/parser/transform/statement/transform_create_type.cpp +++ b/src/parser/transform/statement/transform_create_type.cpp @@ -49,7 +49,7 @@ unique_ptr Transformer::TransformCreateType(duckdb_libpgquery:: if (stmt.query) { // CREATE TYPE mood AS ENUM (SELECT ...) 
D_ASSERT(stmt.vals == nullptr); - auto query = TransformSelect(stmt.query, false); + auto query = TransformSelectStmt(*stmt.query, false); info->query = std::move(query); info->type = LogicalType::INVALID; } else { diff --git a/src/parser/transform/statement/transform_create_view.cpp b/src/parser/transform/statement/transform_create_view.cpp index 0f3cac0031e3..8504ecfc9a91 100644 --- a/src/parser/transform/statement/transform_create_view.cpp +++ b/src/parser/transform/statement/transform_create_view.cpp @@ -21,7 +21,7 @@ unique_ptr Transformer::TransformCreateView(duckdb_libpgquery:: } info->on_conflict = TransformOnConflict(stmt.onconflict); - info->query = TransformSelect(*PGPointerCast(stmt.query), false); + info->query = TransformSelectStmt(*PGPointerCast(stmt.query), false); PivotEntryCheck("view"); diff --git a/src/parser/transform/statement/transform_insert.cpp b/src/parser/transform/statement/transform_insert.cpp index dfa3c25012c9..57c895cf7e8b 100644 --- a/src/parser/transform/statement/transform_insert.cpp +++ b/src/parser/transform/statement/transform_insert.cpp @@ -41,7 +41,7 @@ unique_ptr Transformer::TransformInsert(duckdb_libpgquery::PGIn TransformExpressionList(*stmt.returningList, result->returning_list); } if (stmt.selectStmt) { - result->select_statement = TransformSelect(stmt.selectStmt, false); + result->select_statement = TransformSelectStmt(*stmt.selectStmt, false); } else { result->default_values = true; } diff --git a/src/parser/transform/statement/transform_select.cpp b/src/parser/transform/statement/transform_select.cpp index 7b85d1f5d59b..2e5135ef640d 100644 --- a/src/parser/transform/statement/transform_select.cpp +++ b/src/parser/transform/statement/transform_select.cpp @@ -4,20 +4,19 @@ namespace duckdb { -unique_ptr Transformer::TransformSelectNode(duckdb_libpgquery::PGSelectStmt &select) { - unique_ptr stmt = nullptr; - if (select.pivot) { - stmt = TransformPivotStatement(select); - } else { - stmt = TransformSelectInternal(select); +unique_ptr Transformer::TransformSelectNode(duckdb_libpgquery::PGNode &node, bool is_select) { + switch (node.type) { + case duckdb_libpgquery::T_PGVariableShowSelectStmt: + return TransformShowSelect(PGCast(node)); + case duckdb_libpgquery::T_PGVariableShowStmt: + return TransformShow(PGCast(node)); + default: + return TransformSelectNodeInternal(PGCast(node), is_select); } - - return TransformMaterializedCTE(std::move(stmt)); } -unique_ptr Transformer::TransformSelect(duckdb_libpgquery::PGSelectStmt &select, bool is_select) { - auto result = make_uniq(); - +unique_ptr Transformer::TransformSelectNodeInternal(duckdb_libpgquery::PGSelectStmt &select, + bool is_select) { // Both Insert/Create Table As uses this. 
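// [editor's note, not part of the patch] The transform_select.cpp hunks split the
// old TransformSelect into a node-level and a statement-level entry point,
// paraphrasing the signatures in this patch:
//
//	unique_ptr<QueryNode>       TransformSelectNode(duckdb_libpgquery::PGNode &node, bool is_select);
//	unique_ptr<SelectStatement> TransformSelectStmt(duckdb_libpgquery::PGNode &node, bool is_select);
//
// Callers that only need a QueryNode (CTE bodies, COPY queries, pivot subqueries,
// macro bodies) no longer build and immediately unwrap a throwaway SelectStatement,
// and SHOW/DESCRIBE are dispatched inside TransformSelectNode as ordinary query
// nodes. The latter is what lets "(DESCRIBE (values(42))) UNION ALL (DESCRIBE ...)"
// in the describe_subquery.test hunk further down parse cleanly.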
if (is_select) { if (select.intoClause) { @@ -27,20 +26,26 @@ unique_ptr Transformer::TransformSelect(duckdb_libpgquery::PGSe throw ParserException("SELECT locking clause is not supported!"); } } + unique_ptr stmt = nullptr; + if (select.pivot) { + stmt = TransformPivotStatement(select); + } else { + stmt = TransformSelectInternal(select); + } + return TransformMaterializedCTE(std::move(stmt)); +} - result->node = TransformSelectNode(select); +unique_ptr Transformer::TransformSelectStmt(duckdb_libpgquery::PGSelectStmt &select, bool is_select) { + auto result = make_uniq(); + result->node = TransformSelectNodeInternal(select, is_select); return result; } -unique_ptr Transformer::TransformSelect(optional_ptr node, bool is_select) { - switch (node->type) { - case duckdb_libpgquery::T_PGVariableShowSelectStmt: - return TransformShowSelect(PGCast(*node)); - case duckdb_libpgquery::T_PGVariableShowStmt: - return TransformShow(PGCast(*node)); - default: - return TransformSelect(PGCast(*node), is_select); - } +unique_ptr Transformer::TransformSelectStmt(duckdb_libpgquery::PGNode &node, bool is_select) { + auto select_node = TransformSelectNode(node, is_select); + auto select_statement = make_uniq(); + select_statement->node = std::move(select_node); + return select_statement; } } // namespace duckdb diff --git a/src/parser/transform/statement/transform_show.cpp b/src/parser/transform/statement/transform_show.cpp index c80ea92a24f1..12e6d6860d33 100644 --- a/src/parser/transform/statement/transform_show.cpp +++ b/src/parser/transform/statement/transform_show.cpp @@ -9,7 +9,7 @@ namespace duckdb { -unique_ptr Transformer::TransformShow(duckdb_libpgquery::PGVariableShowStmt &stmt) { +unique_ptr Transformer::TransformShow(duckdb_libpgquery::PGVariableShowStmt &stmt) { string name = stmt.name; auto select_node = make_uniq(); @@ -18,9 +18,12 @@ unique_ptr Transformer::TransformShow(duckdb_libpgquery::PGVari showref->table_name = std::move(name); showref->show_type = stmt.is_summary ? ShowType::SUMMARY : ShowType::DESCRIBE; select_node->from_table = std::move(showref); + return std::move(select_node); +} +unique_ptr Transformer::TransformShowStmt(duckdb_libpgquery::PGVariableShowStmt &stmt) { auto result = make_uniq(); - result->node = std::move(select_node); + result->node = TransformShow(stmt); return result; } diff --git a/src/parser/transform/statement/transform_show_select.cpp b/src/parser/transform/statement/transform_show_select.cpp index 78e19d175842..bd6f52a87492 100644 --- a/src/parser/transform/statement/transform_show_select.cpp +++ b/src/parser/transform/statement/transform_show_select.cpp @@ -7,19 +7,21 @@ namespace duckdb { -unique_ptr Transformer::TransformShowSelect(duckdb_libpgquery::PGVariableShowSelectStmt &stmt) { +unique_ptr Transformer::TransformShowSelect(duckdb_libpgquery::PGVariableShowSelectStmt &stmt) { // we capture the select statement of SHOW auto select_node = make_uniq(); select_node->select_list.push_back(make_uniq()); auto show_ref = make_uniq(); show_ref->show_type = stmt.is_summary ? 
ShowType::SUMMARY : ShowType::DESCRIBE; - auto select = TransformSelect(stmt.stmt); - show_ref->query = std::move(select->node); + show_ref->query = TransformSelectNode(*stmt.stmt); select_node->from_table = std::move(show_ref); + return std::move(select_node); +} +unique_ptr Transformer::TransformShowSelectStmt(duckdb_libpgquery::PGVariableShowSelectStmt &stmt) { auto result = make_uniq(); - result->node = std::move(select_node); + result->node = TransformShowSelect(stmt); return result; } diff --git a/src/parser/transform/tableref/transform_pivot.cpp b/src/parser/transform/tableref/transform_pivot.cpp index 1223994de7a6..62ca0e1b7a68 100644 --- a/src/parser/transform/tableref/transform_pivot.cpp +++ b/src/parser/transform/tableref/transform_pivot.cpp @@ -79,7 +79,7 @@ PivotColumn Transformer::TransformPivotColumn(duckdb_libpgquery::PGPivot &pivot, } } if (pivot.subquery) { - col.subquery = TransformSelectNode(*PGPointerCast(pivot.subquery)); + col.subquery = TransformSelectNode(*pivot.subquery); } if (pivot.pivot_enum) { col.pivot_enum = pivot.pivot_enum; diff --git a/src/parser/transform/tableref/transform_subquery.cpp b/src/parser/transform/tableref/transform_subquery.cpp index 1d5105adf725..58c61811c3fc 100644 --- a/src/parser/transform/tableref/transform_subquery.cpp +++ b/src/parser/transform/tableref/transform_subquery.cpp @@ -5,7 +5,7 @@ namespace duckdb { unique_ptr Transformer::TransformRangeSubselect(duckdb_libpgquery::PGRangeSubselect &root) { Transformer subquery_transformer(*this); - auto subquery = subquery_transformer.TransformSelect(root.subquery); + auto subquery = subquery_transformer.TransformSelectStmt(*root.subquery); if (!subquery) { return nullptr; } diff --git a/src/parser/transformer.cpp b/src/parser/transformer.cpp index 6a30a9d334e1..123a3b5f1e66 100644 --- a/src/parser/transformer.cpp +++ b/src/parser/transformer.cpp @@ -144,7 +144,7 @@ unique_ptr Transformer::TransformStatementInternal(duckdb_libpgque return result; } case duckdb_libpgquery::T_PGSelectStmt: - return TransformSelect(PGCast(stmt)); + return TransformSelectStmt(PGCast(stmt)); case duckdb_libpgquery::T_PGCreateStmt: return TransformCreateTable(PGCast(stmt)); case duckdb_libpgquery::T_PGCreateSchemaStmt: @@ -194,9 +194,9 @@ unique_ptr Transformer::TransformStatementInternal(duckdb_libpgque case duckdb_libpgquery::T_PGVacuumStmt: return TransformVacuum(PGCast(stmt)); case duckdb_libpgquery::T_PGVariableShowStmt: - return TransformShow(PGCast(stmt)); + return TransformShowStmt(PGCast(stmt)); case duckdb_libpgquery::T_PGVariableShowSelectStmt: - return TransformShowSelect(PGCast(stmt)); + return TransformShowSelectStmt(PGCast(stmt)); case duckdb_libpgquery::T_PGCallStmt: return TransformCall(PGCast(stmt)); case duckdb_libpgquery::T_PGVariableSetStmt: diff --git a/src/planner/binder/expression/bind_macro_expression.cpp b/src/planner/binder/expression/bind_macro_expression.cpp index ffb9a72da2a5..caa6ba777da4 100644 --- a/src/planner/binder/expression/bind_macro_expression.cpp +++ b/src/planner/binder/expression/bind_macro_expression.cpp @@ -99,8 +99,8 @@ void ExpressionBinder::ReplaceMacroParameters(unique_ptr &expr *expr, [&](unique_ptr &child) { ReplaceMacroParameters(child, lambda_params); }); } -BindResult ExpressionBinder::BindMacro(FunctionExpression &function, ScalarMacroCatalogEntry ¯o_func, idx_t depth, - unique_ptr &expr) { +void ExpressionBinder::UnfoldMacroExpression(FunctionExpression &function, ScalarMacroCatalogEntry ¯o_func, + unique_ptr &expr) { // validate the arguments and separate 
positional and default arguments vector> positionals; unordered_map> defaults; @@ -143,6 +143,14 @@ BindResult ExpressionBinder::BindMacro(FunctionExpression &function, ScalarMacro // now replace the parameters vector> lambda_params; ReplaceMacroParameters(expr, lambda_params); +} + +BindResult ExpressionBinder::BindMacro(FunctionExpression &function, ScalarMacroCatalogEntry ¯o_func, idx_t depth, + unique_ptr &expr) { + auto stack_checker = StackCheck(*expr, 3); + + // unfold the macro expression + UnfoldMacroExpression(function, macro_func, expr); // bind the unfolded macro return BindExpression(expr, depth); diff --git a/src/planner/binder/statement/bind_create.cpp b/src/planner/binder/statement/bind_create.cpp index c461713ebf4a..f03f260c4889 100644 --- a/src/planner/binder/statement/bind_create.cpp +++ b/src/planner/binder/statement/bind_create.cpp @@ -588,6 +588,7 @@ unique_ptr DuckCatalog::BindCreateIndex(Binder &binder, CreateS create_index_info->scan_types.emplace_back(LogicalType::ROW_TYPE); create_index_info->names = get.names; create_index_info->column_ids = column_ids; + create_index_info->schema = table.schema.name; auto &bind_data = get.bind_data->Cast(); bind_data.is_create_index = true; get.AddColumnId(COLUMN_IDENTIFIER_ROW_ID); diff --git a/src/planner/binder/statement/bind_insert.cpp b/src/planner/binder/statement/bind_insert.cpp index ac430edee321..f02be80009d1 100644 --- a/src/planner/binder/statement/bind_insert.cpp +++ b/src/planner/binder/statement/bind_insert.cpp @@ -295,7 +295,7 @@ void Binder::BindOnConflictClause(LogicalInsert &insert, TableCatalogEntry &tabl auto entry = specified_columns.find(col.Name()); if (entry != specified_columns.end()) { // column was specified, set to the index - insert.on_conflict_filter.insert(col.Oid()); + insert.on_conflict_filter.insert(col.Physical().index); } } bool index_references_columns = false; @@ -353,8 +353,12 @@ void Binder::BindOnConflictClause(LogicalInsert &insert, TableCatalogEntry &tabl // add a bind context entry for it auto excluded_index = GenerateTableIndex(); insert.excluded_table_index = excluded_index; - auto table_column_names = columns.GetColumnNames(); - auto table_column_types = columns.GetColumnTypes(); + vector table_column_names; + vector table_column_types; + for (auto &col : columns.Physical()) { + table_column_names.push_back(col.Name()); + table_column_types.push_back(col.Type()); + } bind_context.AddGenericBinding(excluded_index, "excluded", table_column_names, table_column_types); if (on_conflict.condition) { diff --git a/src/planner/expression_binder.cpp b/src/planner/expression_binder.cpp index e26d04aeba7e..146f1790b097 100644 --- a/src/planner/expression_binder.cpp +++ b/src/planner/expression_binder.cpp @@ -7,6 +7,7 @@ #include "duckdb/planner/expression/list.hpp" #include "duckdb/planner/expression_iterator.hpp" #include "duckdb/common/operator/cast_operators.hpp" +#include "duckdb/main/client_config.hpp" namespace duckdb { @@ -36,18 +37,21 @@ ExpressionBinder::~ExpressionBinder() { } void ExpressionBinder::InitializeStackCheck() { + static constexpr idx_t INITIAL_DEPTH = 5; if (binder.HasActiveBinder()) { - stack_depth = binder.GetActiveBinder().stack_depth; + stack_depth = binder.GetActiveBinder().stack_depth + INITIAL_DEPTH; } else { - stack_depth = 0; + stack_depth = INITIAL_DEPTH; } } StackChecker ExpressionBinder::StackCheck(const ParsedExpression &expr, idx_t extra_stack) { D_ASSERT(stack_depth != DConstants::INVALID_INDEX); - if (stack_depth + extra_stack >= MAXIMUM_STACK_DEPTH) { - 
throw BinderException("Maximum recursion depth exceeded (Maximum: %llu) while binding \"%s\"", - MAXIMUM_STACK_DEPTH, expr.ToString()); + auto &options = ClientConfig::GetConfig(context); + if (stack_depth + extra_stack >= options.max_expression_depth) { + throw BinderException("Max expression depth limit of %lld exceeded. Use \"SET max_expression_depth TO x\" to " + "increase the maximum expression depth.", + options.max_expression_depth); } return StackChecker(*this, extra_stack); } diff --git a/src/planner/logical_operator.cpp b/src/planner/logical_operator.cpp index 2b704dfebeef..ba8ae24a4b89 100644 --- a/src/planner/logical_operator.cpp +++ b/src/planner/logical_operator.cpp @@ -30,6 +30,17 @@ vector LogicalOperator::GetColumnBindings() { return {ColumnBinding(0, 0)}; } +void LogicalOperator::SetParamsEstimatedCardinality(InsertionOrderPreservingMap &result) const { + if (has_estimated_cardinality) { + result[RenderTreeNode::ESTIMATED_CARDINALITY] = StringUtil::Format("%llu", estimated_cardinality); + } +} + +void LogicalOperator::SetEstimatedCardinality(idx_t _estimated_cardinality) { + estimated_cardinality = _estimated_cardinality; + has_estimated_cardinality = true; +} + // LCOV_EXCL_START string LogicalOperator::ColumnBindingsToString(const vector &bindings) { string result = "{"; @@ -61,6 +72,7 @@ InsertionOrderPreservingMap LogicalOperator::ParamsToString() const { expressions_info += expressions[i]->GetName(); } result["Expressions"] = expressions_info; + SetParamsEstimatedCardinality(result); return result; } @@ -220,11 +232,6 @@ idx_t LogicalOperator::EstimateCardinality(ClientContext &context) { return estimated_cardinality; } -void LogicalOperator::SetEstimatedCardinality(idx_t _estimated_cardinality) { - estimated_cardinality = _estimated_cardinality; - has_estimated_cardinality = true; -} - void LogicalOperator::Print() { Printer::Print(ToString()); } diff --git a/src/planner/operator/logical_aggregate.cpp b/src/planner/operator/logical_aggregate.cpp index beb17c5bedfd..3ee244c22f1f 100644 --- a/src/planner/operator/logical_aggregate.cpp +++ b/src/planner/operator/logical_aggregate.cpp @@ -59,6 +59,7 @@ InsertionOrderPreservingMap LogicalAggregate::ParamsToString() const { expressions_info += expressions[i]->GetName(); } result["Expressions"] = expressions_info; + SetParamsEstimatedCardinality(result); return result; } diff --git a/src/planner/operator/logical_any_join.cpp b/src/planner/operator/logical_any_join.cpp index d5c0f7a69f5d..07587dad9093 100644 --- a/src/planner/operator/logical_any_join.cpp +++ b/src/planner/operator/logical_any_join.cpp @@ -8,6 +8,7 @@ LogicalAnyJoin::LogicalAnyJoin(JoinType type) : LogicalJoin(type, LogicalOperato InsertionOrderPreservingMap LogicalAnyJoin::ParamsToString() const { InsertionOrderPreservingMap result; result["Condition"] = condition->ToString(); + SetParamsEstimatedCardinality(result); return result; } diff --git a/src/planner/operator/logical_comparison_join.cpp b/src/planner/operator/logical_comparison_join.cpp index 66c6afb88cc9..48efe6259d78 100644 --- a/src/planner/operator/logical_comparison_join.cpp +++ b/src/planner/operator/logical_comparison_join.cpp @@ -23,6 +23,8 @@ InsertionOrderPreservingMap LogicalComparisonJoin::ParamsToString() cons conditions_info += expr->ToString(); } result["Conditions"] = conditions_info; + SetParamsEstimatedCardinality(result); + return result; } diff --git a/src/planner/operator/logical_cteref.cpp b/src/planner/operator/logical_cteref.cpp index 28082aa7d650..e82b1448a571 100644 --- 
a/src/planner/operator/logical_cteref.cpp +++ b/src/planner/operator/logical_cteref.cpp @@ -7,6 +7,7 @@ namespace duckdb { InsertionOrderPreservingMap LogicalCTERef::ParamsToString() const { InsertionOrderPreservingMap result; result["CTE Index"] = StringUtil::Format("%llu", cte_index); + SetParamsEstimatedCardinality(result); return result; } diff --git a/src/planner/operator/logical_distinct.cpp b/src/planner/operator/logical_distinct.cpp index fb87b66ee927..f6983b5e811c 100644 --- a/src/planner/operator/logical_distinct.cpp +++ b/src/planner/operator/logical_distinct.cpp @@ -18,6 +18,7 @@ InsertionOrderPreservingMap LogicalDistinct::ParamsToString() const { StringUtil::Join(distinct_targets, distinct_targets.size(), "\n", [](const unique_ptr &child) { return child->GetName(); }); } + SetParamsEstimatedCardinality(result); return result; } diff --git a/src/planner/operator/logical_get.cpp b/src/planner/operator/logical_get.cpp index 1ed160a7cd9b..d6e9fda93819 100644 --- a/src/planner/operator/logical_get.cpp +++ b/src/planner/operator/logical_get.cpp @@ -59,6 +59,7 @@ InsertionOrderPreservingMap LogicalGet::ParamsToString() const { if (function.to_string) { result["__text__"] = function.to_string(bind_data.get()); } + SetParamsEstimatedCardinality(result); return result; } diff --git a/src/planner/operator/logical_materialized_cte.cpp b/src/planner/operator/logical_materialized_cte.cpp index 3b2dc54e8046..043695bbd17b 100644 --- a/src/planner/operator/logical_materialized_cte.cpp +++ b/src/planner/operator/logical_materialized_cte.cpp @@ -5,6 +5,7 @@ namespace duckdb { InsertionOrderPreservingMap LogicalMaterializedCTE::ParamsToString() const { InsertionOrderPreservingMap result; result["Table Index"] = StringUtil::Format("%llu", table_index); + SetParamsEstimatedCardinality(result); return result; } diff --git a/src/planner/operator/logical_order.cpp b/src/planner/operator/logical_order.cpp index 5d98dda9f278..bf904c3f3983 100644 --- a/src/planner/operator/logical_order.cpp +++ b/src/planner/operator/logical_order.cpp @@ -24,6 +24,7 @@ InsertionOrderPreservingMap LogicalOrder::ParamsToString() const { orders_info += orders[i].expression->GetName(); } result["__order_by__"] = orders_info; + SetParamsEstimatedCardinality(result); return result; } diff --git a/src/storage/buffer/buffer_pool.cpp b/src/storage/buffer/buffer_pool.cpp index c95abbc7b323..76382fdf3b2c 100644 --- a/src/storage/buffer/buffer_pool.cpp +++ b/src/storage/buffer/buffer_pool.cpp @@ -309,6 +309,8 @@ BufferPool::EvictionResult BufferPool::EvictBlocksInternal(EvictionQueue &queue, if (!found) { r.Resize(0); + } else if (Allocator::SupportsFlush() && extra_memory > allocator_bulk_deallocation_flush_threshold) { + Allocator::FlushAll(); } return {found, std::move(r)}; @@ -401,6 +403,10 @@ void BufferPool::SetLimit(idx_t limit, const char *exception_postscript) { } } +void BufferPool::SetAllocatorBulkDeallocationFlushThreshold(idx_t threshold) { + allocator_bulk_deallocation_flush_threshold = threshold; +} + BufferPool::MemoryUsage::MemoryUsage() { for (auto &v : memory_usage) { v = 0; diff --git a/src/storage/checkpoint/table_data_writer.cpp b/src/storage/checkpoint/table_data_writer.cpp index aedc701d5df6..5a134c3c6e31 100644 --- a/src/storage/checkpoint/table_data_writer.cpp +++ b/src/storage/checkpoint/table_data_writer.cpp @@ -31,7 +31,11 @@ void TableDataWriter::AddRowGroup(RowGroupPointer &&row_group_pointer, unique_pt } TaskScheduler &TableDataWriter::GetScheduler() { - return 
TaskScheduler::GetScheduler(table.ParentCatalog().GetDatabase()); + return TaskScheduler::GetScheduler(GetDatabase()); +} + +DatabaseInstance &TableDataWriter::GetDatabase() { + return table.ParentCatalog().GetDatabase(); } SingleFileTableDataWriter::SingleFileTableDataWriter(SingleFileCheckpointWriter &checkpoint_manager, diff --git a/src/storage/checkpoint/write_overflow_strings_to_disk.cpp b/src/storage/checkpoint/write_overflow_strings_to_disk.cpp index 5410e53caaf5..c58be310271c 100644 --- a/src/storage/checkpoint/write_overflow_strings_to_disk.cpp +++ b/src/storage/checkpoint/write_overflow_strings_to_disk.cpp @@ -87,6 +87,8 @@ void WriteOverflowStringsToDisk::Flush() { // write to disk auto &block_manager = partial_block_manager.GetBlockManager(); block_manager.Write(handle.GetFileBuffer(), block_id); + + auto lock = partial_block_manager.GetLock(); partial_block_manager.AddWrittenBlock(block_id); } block_id = INVALID_BLOCK; diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index aa683d12fc1f..3e7b63da24d1 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -225,13 +225,16 @@ TableIOManager &TableIOManager::Get(DataTable &table) { //===--------------------------------------------------------------------===// void DataTable::InitializeScan(TableScanState &state, const vector &column_ids, TableFilterSet *table_filters) { - state.checkpoint_lock = info->checkpoint_lock.GetSharedLock(); + if (!state.checkpoint_lock) { + state.checkpoint_lock = make_shared_ptr(info->checkpoint_lock.GetSharedLock()); + } state.Initialize(column_ids, table_filters); row_groups->InitializeScan(state.table_state, column_ids, table_filters); } void DataTable::InitializeScan(DuckTransaction &transaction, TableScanState &state, const vector &column_ids, TableFilterSet *table_filters) { + state.checkpoint_lock = transaction.SharedLockTable(*info); auto &local_storage = LocalStorage::Get(transaction); InitializeScan(state, column_ids, table_filters); local_storage.InitializeScan(*this, state.local_state, table_filters); @@ -239,7 +242,9 @@ void DataTable::InitializeScan(DuckTransaction &transaction, TableScanState &sta void DataTable::InitializeScanWithOffset(TableScanState &state, const vector &column_ids, idx_t start_row, idx_t end_row) { - state.checkpoint_lock = info->checkpoint_lock.GetSharedLock(); + if (!state.checkpoint_lock) { + state.checkpoint_lock = make_shared_ptr(info->checkpoint_lock.GetSharedLock()); + } state.Initialize(column_ids); row_groups->InitializeScanWithOffset(state.table_state, column_ids, start_row, end_row); } @@ -255,7 +260,8 @@ idx_t DataTable::MaxThreads(ClientContext &context) { void DataTable::InitializeParallelScan(ClientContext &context, ParallelTableScanState &state) { auto &local_storage = LocalStorage::Get(context, db); - state.checkpoint_lock = info->checkpoint_lock.GetSharedLock(); + auto &transaction = DuckTransaction::Get(context, db); + state.checkpoint_lock = transaction.SharedLockTable(*info); row_groups->InitializeParallelScan(state.scan_state); local_storage.InitializeParallelScan(*this, state.local_state); diff --git a/src/storage/serialization/serialize_nodes.cpp b/src/storage/serialization/serialize_nodes.cpp index 42be4f6e2cae..872e3a5b8858 100644 --- a/src/storage/serialization/serialize_nodes.cpp +++ b/src/storage/serialization/serialize_nodes.cpp @@ -203,7 +203,7 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault>(134, "sql_type_list", sql_type_list); 
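// [editor's note, not part of the patch] The data_table.cpp hunks above stop each
// scan from grabbing its own shared checkpoint lock; scans now go through
// DuckTransaction::SharedLockTable (added near the end of this patch), which keeps
// one lock per table in a weak_ptr map, so a query that scans the same table
// several times shares a single lock. The caching step, condensed from the
// duck_transaction.cpp hunk:
//
//	auto entry = active_locks.find(info);
//	if (entry != active_locks.end()) {
//		if (auto lock = entry->second.lock()) { // weak_ptr still alive?
//			return lock;                        // reuse the existing shared lock
//		}
//	}
//	// otherwise: take a new shared lock, cache it, and return it
//
// The new tpch_concurrent_checkpoints.test_slow below exercises exactly this:
// queries referencing the same table multiple times while another thread checkpoints.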
serializer.WritePropertyWithDefault>(135, "sql_types_per_column", sql_types_per_column); serializer.WritePropertyWithDefault(136, "columns_set", columns_set, false); - serializer.WritePropertyWithDefault>(137, "dialect_options.state_machine_options.comment", dialect_options.state_machine_options.comment, CSVOption()); + serializer.WritePropertyWithDefault>(137, "dialect_options.state_machine_options.comment", dialect_options.state_machine_options.comment, CSVOption('\0')); serializer.WritePropertyWithDefault(138, "dialect_options.rows_until_header", dialect_options.rows_until_header); } @@ -246,7 +246,7 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { deserializer.ReadPropertyWithDefault>(134, "sql_type_list", result.sql_type_list); deserializer.ReadPropertyWithDefault>(135, "sql_types_per_column", result.sql_types_per_column); deserializer.ReadPropertyWithExplicitDefault(136, "columns_set", result.columns_set, false); - deserializer.ReadPropertyWithExplicitDefault>(137, "dialect_options.state_machine_options.comment", result.dialect_options.state_machine_options.comment, CSVOption()); + deserializer.ReadPropertyWithExplicitDefault>(137, "dialect_options.state_machine_options.comment", result.dialect_options.state_machine_options.comment, CSVOption('\0')); deserializer.ReadPropertyWithDefault(138, "dialect_options.rows_until_header", result.dialect_options.rows_until_header); return result; } diff --git a/src/storage/table/column_data.cpp b/src/storage/table/column_data.cpp index dd28a9bfe0b5..cc19a550166c 100644 --- a/src/storage/table/column_data.cpp +++ b/src/storage/table/column_data.cpp @@ -230,8 +230,12 @@ void ColumnData::UpdateInternal(TransactionData transaction, idx_t column_index, template idx_t ColumnData::ScanVector(TransactionData transaction, idx_t vector_index, ColumnScanState &state, Vector &result, idx_t target_scan) { - auto scan_count = ScanVector(state, result, target_scan, GetVectorScanType(state, target_scan)); - FetchUpdates(transaction, vector_index, result, scan_count, ALLOW_UPDATES, SCAN_COMMITTED); + auto scan_type = GetVectorScanType(state, target_scan); + auto scan_count = ScanVector(state, result, target_scan, scan_type); + if (scan_type != ScanVectorType::SCAN_ENTIRE_VECTOR) { + // if we are scanning an entire vector we cannot have updates + FetchUpdates(transaction, vector_index, result, scan_count, ALLOW_UPDATES, SCAN_COMMITTED); + } return scan_count; } diff --git a/src/storage/table/row_group_collection.cpp b/src/storage/table/row_group_collection.cpp index 62140832571c..cf60c6ebae41 100644 --- a/src/storage/table/row_group_collection.cpp +++ b/src/storage/table/row_group_collection.cpp @@ -417,6 +417,9 @@ void RowGroupCollection::FinalizeAppend(TransactionData transaction, TableAppend continue; } auto &local_stats = state.stats.GetStats(*local_stats_lock, col_idx); + if (!local_stats.HasDistinctStats()) { + continue; + } global_stats.DistinctStats().Merge(local_stats.DistinctStats()); } @@ -809,6 +812,7 @@ class VacuumTask : public BaseCheckpointTask { if (scan_chunk.size() == 0) { break; } + scan_chunk.Flatten(); idx_t remaining = scan_chunk.size(); while (remaining > 0) { idx_t append_count = @@ -962,12 +966,17 @@ void RowGroupCollection::Checkpoint(TableDataWriter &writer, TableStatistics &gl VacuumState vacuum_state; InitializeVacuumState(checkpoint_state, vacuum_state, segments); // schedule tasks + idx_t total_vacuum_tasks = 0; + auto &config = DBConfig::GetConfig(writer.GetDatabase()); for (idx_t segment_idx = 0; 
segment_idx < segments.size(); segment_idx++) { auto &entry = segments[segment_idx]; - auto vacuum_tasks = ScheduleVacuumTasks(checkpoint_state, vacuum_state, segment_idx); - if (vacuum_tasks) { - // vacuum tasks were scheduled - don't schedule a checkpoint task yet - continue; + if (total_vacuum_tasks < config.options.max_vacuum_tasks) { + auto vacuum_tasks = ScheduleVacuumTasks(checkpoint_state, vacuum_state, segment_idx); + if (vacuum_tasks) { + // vacuum tasks were scheduled - don't schedule a checkpoint task yet + total_vacuum_tasks++; + continue; + } } if (!entry.node) { // row group was vacuumed/dropped - skip diff --git a/src/storage/wal_replay.cpp b/src/storage/wal_replay.cpp index 85fe1bf368ef..890a5a617db5 100644 --- a/src/storage/wal_replay.cpp +++ b/src/storage/wal_replay.cpp @@ -87,10 +87,9 @@ class WriteAheadLogDeserializer { // compute and verify the checksum auto computed_checksum = Checksum(buffer.get(), size); if (stored_checksum != computed_checksum) { - throw SerializationException( - "Corrupt WAL file: entry at byte position %llu computed checksum %llu does not match " - "stored checksum %llu", - offset, computed_checksum, stored_checksum); + throw IOException("Corrupt WAL file: entry at byte position %llu computed checksum %llu does not match " + "stored checksum %llu", + offset, computed_checksum, stored_checksum); } return WriteAheadLogDeserializer(state_p, std::move(buffer), size, deserialize_only); } diff --git a/src/transaction/duck_transaction.cpp b/src/transaction/duck_transaction.cpp index 94299d225544..21e7b58df03e 100644 --- a/src/transaction/duck_transaction.cpp +++ b/src/transaction/duck_transaction.cpp @@ -17,6 +17,8 @@ #include "duckdb/main/client_data.hpp" #include "duckdb/main/attached_database.hpp" #include "duckdb/storage/storage_lock.hpp" +#include "duckdb/storage/table/data_table_info.hpp" +#include "duckdb/storage/table/scan_state.hpp" namespace duckdb { @@ -268,4 +270,25 @@ unique_ptr DuckTransaction::TryGetCheckpointLock() { return transaction_manager.TryUpgradeCheckpointLock(*write_lock); } +shared_ptr DuckTransaction::SharedLockTable(DataTableInfo &info) { + lock_guard l(active_locks_lock); + auto entry = active_locks.find(info); + if (entry != active_locks.end()) { + // found an existing lock + auto lock_weak_ptr = entry->second; + // check if it is expired + auto lock = lock_weak_ptr.lock(); + if (lock) { + // not expired - return it + return lock; + } + } + // no existing lock - obtain it + auto table_lock = info.GetSharedLock(); + auto checkpoint_lock = make_shared_ptr(std::move(table_lock)); + // insert it into the active locks and return it + active_locks.insert(make_pair(std::ref(info), checkpoint_lock)); + return checkpoint_lock; +} + } // namespace duckdb diff --git a/test/api/test_reset.cpp b/test/api/test_reset.cpp index cc0f5d0c77b1..3e58d3fa75af 100644 --- a/test/api/test_reset.cpp +++ b/test/api/test_reset.cpp @@ -105,7 +105,8 @@ OptionValueSet GetValueForOption(const string &name, LogicalTypeId type) { {"http_proxy_username", {"john"}}, {"http_proxy_password", {"doe"}}, {"http_logging_output", {"my_cool_outputfile"}}, - {"allocator_flush_threshold", {"4.0 GiB"}}}; + {"allocator_flush_threshold", {"4.0 GiB"}}, + {"allocator_bulk_deallocation_flush_threshold", {"4.0 GiB"}}}; // Every option that's not excluded has to be part of this map if (!value_map.count(name)) { switch (type) { diff --git a/test/fuzzer/duckfuzz/array_list_gather_cast.test b/test/fuzzer/duckfuzz/array_list_gather_cast.test new file mode 100644 index 
000000000000..3eb256bd711f --- /dev/null +++ b/test/fuzzer/duckfuzz/array_list_gather_cast.test @@ -0,0 +1,21 @@ +# name: test/fuzzer/duckfuzz/array_list_gather_cast.test +# description: duckdb-fuzzer #3237 - Vector was not reset from cache before gathering +# group: [duckfuzz] + +statement ok +create table my_table as select * exclude(small_enum, medium_enum, large_enum) from test_all_types(); + +# should error, not crash +statement error +FROM main.my_table ref_0 + INNER JOIN + main.my_table AS ref_1 + ON ( + SELECT + ref_1.struct_of_arrays AS c6 + FROM main.my_table AS ref_2 + WHERE + EXISTS(SELECT ref_2.fixed_nested_varchar_array WHERE ref_1."union") + ) and ref_1."varchar" ~~ ref_1."varchar"; +---- +Conversion Error diff --git a/test/optimizer/estimated_cardinalities_are_in_logical_plan.test b/test/optimizer/estimated_cardinalities_are_in_logical_plan.test new file mode 100644 index 000000000000..c095ba33299d --- /dev/null +++ b/test/optimizer/estimated_cardinalities_are_in_logical_plan.test @@ -0,0 +1,22 @@ +# name: test/optimizer/estimated_cardinalities_are_in_logical_plan.test +# description: Make sure estimated cardinalities are respected +# group: [optimizer] + +require notwindows + +statement ok +create table t1 as select range a from range(100000); + +statement ok +create table t2 as select range b from range(500, 100000); + +statement ok +create table t3 as select range c from range(10000, 1000000); + +statement ok +pragma explain_output=OPTIMIZED_ONLY; + +query II +explain select * from t1, t2, t3 where a = b and b = c; +---- +logical_opt :.*COMPARISON_JOIN.*a = b.*~12260.* diff --git a/test/sql/aggregate/aggregates/test_median.test b/test/sql/aggregate/aggregates/test_median.test index 6502370773c8..1929973bd208 100644 --- a/test/sql/aggregate/aggregates/test_median.test +++ b/test/sql/aggregate/aggregates/test_median.test @@ -25,22 +25,22 @@ create table quantile as select range r, random() from range(10000) union all va query I SELECT median(r) FROM quantile ---- -4999 +4999.5 query I SELECT median(r) FROM quantile ---- -4999 +4999.5 query R SELECT median(r::float) FROM quantile ---- -4999 +4999.5 query R SELECT median(r::double) FROM quantile ---- -4999 +4999.5 query I SELECT median(r::tinyint) FROM quantile where r < 100 @@ -50,22 +50,22 @@ SELECT median(r::tinyint) FROM quantile where r < 100 query I SELECT median(r::smallint) FROM quantile ---- -4999 +4999.5 query I SELECT median(r::integer) FROM quantile ---- -4999 +4999.5 query I SELECT median(r::bigint) FROM quantile ---- -4999 +4999.5 query I SELECT median(r::hugeint) FROM quantile ---- -4999 +4999.5 query I SELECT median(r::decimal(10,2)) FROM quantile diff --git a/test/sql/catalog/function/test_recursive_macro.test b/test/sql/catalog/function/test_recursive_macro.test index 71bdcf05a3ec..3fc2c89301ac 100644 --- a/test/sql/catalog/function/test_recursive_macro.test +++ b/test/sql/catalog/function/test_recursive_macro.test @@ -11,12 +11,12 @@ CREATE MACRO "sum"(x) AS (CASE WHEN sum(x) IS NULL THEN 0 ELSE sum(x) END); statement error SELECT sum(1); ---- -Binder Error: Maximum recursion depth exceeded +Max expression depth limit statement error SELECT sum(1) WHERE 42=0 ---- -Binder Error: Maximum recursion depth exceeded +Max expression depth limit statement ok DROP MACRO sum diff --git a/test/sql/catalog/function/test_recursive_macro_no_dependency.test b/test/sql/catalog/function/test_recursive_macro_no_dependency.test index 0e608309be1f..59017e367005 100644 --- 
a/test/sql/catalog/function/test_recursive_macro_no_dependency.test +++ b/test/sql/catalog/function/test_recursive_macro_no_dependency.test @@ -8,12 +8,12 @@ CREATE MACRO "sum"(x) AS (CASE WHEN sum(x) IS NULL THEN 0 ELSE sum(x) END); statement error SELECT sum(1); ---- -Binder Error: Maximum recursion depth exceeded +Max expression depth limit statement error SELECT sum(1) WHERE 42=0 ---- -Binder Error: Maximum recursion depth exceeded +Max expression depth limit statement ok DROP MACRO sum diff --git a/test/sql/copy/csv/test_segfault.test b/test/sql/copy/csv/test_segfault.test new file mode 100644 index 000000000000..1417c2b2bfdd --- /dev/null +++ b/test/sql/copy/csv/test_segfault.test @@ -0,0 +1,189 @@ +# name: test/sql/copy/csv/test_segfault.test +# description: Test CSV that is segfaulting +# group: [csv] + +statement ok +PRAGMA enable_verification + +statement error +from 'data/csv/fuzzing/0.csv' +---- +It was not possible to automatically detect the CSV Parsing + +statement error +from 'data/csv/fuzzing/1.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/2.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement ok +from 'data/csv/fuzzing/3.csv' + +statement ok +from 'data/csv/fuzzing/4.csv' + +statement error +from 'data/csv/fuzzing/5.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/6.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/7.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/8.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement error +from 'data/csv/fuzzing/9.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/10.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/11.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/12.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement ok +from 'data/csv/fuzzing/13.csv' + +statement error +from 'data/csv/fuzzing/14.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement error +from 'data/csv/fuzzing/15.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement error +from 'data/csv/fuzzing/16.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement ok +from 'data/csv/fuzzing/17.csv' + +statement error +from 'data/csv/fuzzing/18.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement ok +from 'data/csv/fuzzing/19.csv' + +statement error +from 'data/csv/fuzzing/20.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement ok +from 'data/csv/fuzzing/21.csv' + + +statement error +from 'data/csv/fuzzing/22.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement error +from 'data/csv/fuzzing/23.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement error +from 'data/csv/fuzzing/24.csv' +---- +Invalid unicode (byte sequence mismatch) detected. 
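# [editor's note, not part of the patch] For readers unfamiliar with the test
# format used above and below: a `statement error` block gives the query, a
# `----` separator, and then a substring that the raised error message must
# contain. The fuzzer inputs in this file are therefore expected to fail with a
# clean sniffer or unicode error rather than crash, which is the regression this
# new test guards against.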
+ +statement error +from 'data/csv/fuzzing/25.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/26.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement error +from 'data/csv/fuzzing/27.csv' +---- +Invalid unicode (byte sequence mismatch) detected. + +statement error +from 'data/csv/fuzzing/28.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/29.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/30.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement ok +from 'data/csv/fuzzing/31.csv' + +statement error +from 'data/csv/fuzzing/32.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/33.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/34.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/35.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/36.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/37.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + +statement error +from 'data/csv/fuzzing/38.csv' +---- +It was not possible to automatically detect the CSV Parsing dialect/types + diff --git a/test/sql/copy/csv/test_sniff_csv.test b/test/sql/copy/csv/test_sniff_csv.test index 1b4ba47f0b06..ed65b4f06796 100644 --- a/test/sql/copy/csv/test_sniff_csv.test +++ b/test/sql/copy/csv/test_sniff_csv.test @@ -27,7 +27,7 @@ FROM sniff_csv('data/csv/real/lineitem_sample.csv'); statement error FROM sniff_csv('data/csv/real/non_ecziste.csv'); ---- -Cannot open file "data/csv/real/non_ecziste.csv": No such file or directory +No files found that match the pattern "data/csv/real/non_ecziste.csv" # Test different sample sizes diff --git a/test/sql/copy/csv/test_sniff_csv_options.test b/test/sql/copy/csv/test_sniff_csv_options.test index e19e72adcf1d..da8cdaa66f03 100644 --- a/test/sql/copy/csv/test_sniff_csv_options.test +++ b/test/sql/copy/csv/test_sniff_csv_options.test @@ -118,7 +118,7 @@ FROM read_csv('data/csv/autotypecandidates.csv', auto_detect=false, delim='|', q statement error FROM sniff_csv('data/csv/hive-partitioning/simple/*/*/test.csv'); ---- -sniff_csv does not operate on globs yet +Not implemented Error: sniff_csv does not operate on more than one file yet # don't accept madeup options statement error diff --git a/test/sql/copy/csv/test_sniff_httpfs.test b/test/sql/copy/csv/test_sniff_httpfs.test new file mode 100644 index 000000000000..ea36b9be17ab --- /dev/null +++ b/test/sql/copy/csv/test_sniff_httpfs.test @@ -0,0 +1,14 @@ +# name: test/sql/copy/csv/test_sniff_httpfs.test +# description: Test sniff_csv functions over httpfs with auto-detection on compression +# group: [csv] + +require httpfs + +statement ok +PRAGMA enable_verification + +statement ok +from sniff_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/who.csv.gz'); + +statement ok +from sniff_csv('https://github.com/duckdb/duckdb/raw/main/data/csv/who.csv.gz?v=1'); diff --git 
a/test/sql/function/array/array_cosine_distance.test b/test/sql/function/array/array_cosine_distance.test new file mode 100644 index 000000000000..6c29f3965d0c --- /dev/null +++ b/test/sql/function/array/array_cosine_distance.test @@ -0,0 +1,53 @@ +# name: test/sql/function/array/array_cosine_distance.test +# group: [array] + +statement ok +PRAGMA enable_verification + +foreach type FLOAT DOUBLE + +# The distance between a vector and itself should be 0 +query I +SELECT array_cosine_distance([1, 2, 3]::${type}[3], [1, 2, 3]::${type}[3]); +---- +0.0 + +# Opposite vectors should have a distance of 2 +query I +SELECT array_cosine_distance([2, 2, 2]::${type}[3], [-2, -2, -2]::${type}[3]); +---- +2.0 + +statement ok +CREATE OR REPLACE TABLE arrays (l ${type}[3]); + +statement ok +INSERT INTO arrays VALUES ([1, 2, 3]), ([4, 5, 6]), ([7, 8, 9]), ([-1, -2, -3]), (NULL); + +query I +SELECT array_cosine_distance(l, [1, 2, 3]::${type}[3]) FROM arrays; +---- +0.0 +0.02536815 +0.04058805 +2.0 +NULL + +statement error +SELECT array_cosine_distance([1, NULL, 3]::${type}[3], [1, 2, 3]::${type}[3]); +---- +left argument can not contain NULL values + +statement error +SELECT array_cosine_distance([1, 2, 3]::${type}[3], [1, NULL, 3]::${type}[3]); +---- +right argument can not contain NULL values + +statement error +SELECT array_cosine_distance([1, 2, 3]::${type}[3], [1, 2, 3, 4]::${type}[4]); +---- +array_cosine_distance: Array arguments must be of the same size + + + +endloop diff --git a/test/sql/function/interval/test_date_part.test b/test/sql/function/interval/test_date_part.test index 3d49242183a2..66411952f0f9 100644 --- a/test/sql/function/interval/test_date_part.test +++ b/test/sql/function/interval/test_date_part.test @@ -56,10 +56,11 @@ SELECT date_part('seconds', i) FROM intervals; query I SELECT date_part('epoch', i) FROM intervals; ---- -63115200 -41925600 -3628800 -2066 +63115200.0 +41925600.0 +3628800.0 +2066.3434 + query I SELECT date_part(s, i) FROM intervals; diff --git a/test/sql/function/interval/test_extract.test b/test/sql/function/interval/test_extract.test index 2ac7bdc87f97..c179f423cfdb 100644 --- a/test/sql/function/interval/test_extract.test +++ b/test/sql/function/interval/test_extract.test @@ -98,10 +98,10 @@ SELECT EXTRACT(yearweek FROM i) FROM intervals query I SELECT EXTRACT(epoch FROM i) FROM intervals ---- -63115200 -41925600 -3628800 -2066 +63115200.0 +41925600.0 +3628800.0 +2066.3434 NULL query I diff --git a/test/sql/function/list/list_cosine_similarity.test b/test/sql/function/list/list_cosine_similarity.test index 4033215aa2c8..795a72c78bc4 100644 --- a/test/sql/function/list/list_cosine_similarity.test +++ b/test/sql/function/list/list_cosine_similarity.test @@ -36,11 +36,16 @@ right argument can not contain NULL values statement error SELECT list_cosine_similarity([1, 2, 3]::${type}[], [1, 2, 3, 4]::${type}[]); ---- -list dimensions must be equal, got left length 3 and right length 4 +list dimensions must be equal, got left length '3' and right length '4' -statement error +query I SELECT list_cosine_similarity([], []); ---- -The cosine similarity for empty vectors is not defined +NULL + +query I +SELECT list_cosine_distance([1, 2, 3]::${type}[], [1, 2, 3]::${type}[]) = 1 - list_cosine_similarity([1, 2, 3]::${type}[], [1, 2, 3]::${type}[]); +---- +true endloop diff --git a/test/sql/function/list/list_distance.test b/test/sql/function/list/list_distance.test index 6bcb42284948..a68bfb33bed5 100644 --- a/test/sql/function/list/list_distance.test +++ 
b/test/sql/function/list/list_distance.test @@ -37,7 +37,7 @@ right argument can not contain NULL values statement error SELECT list_distance([1, 2, 3]::${type}[], [1, 2, 3, 4]::${type}[]); ---- -list dimensions must be equal, got left length 3 and right length 4 +Invalid Input Error: list_distance: list dimensions must be equal, got left length '3' and right length '4' query I SELECT list_distance([], []); diff --git a/test/sql/function/list/list_inner_product.test b/test/sql/function/list/list_inner_product.test index e1f76969b7d0..8f1ae7f20e13 100644 --- a/test/sql/function/list/list_inner_product.test +++ b/test/sql/function/list/list_inner_product.test @@ -42,7 +42,11 @@ right argument can not contain NULL values statement error SELECT list_inner_product([1, 2, 3]::${type}[], [1, 2, 3, 4]::${type}[]); ---- -list dimensions must be equal, got left length 3 and right length 4 +Invalid Input Error: list_inner_product: list dimensions must be equal, got left length '3' and right length '4' +query I +SELECT list_negative_inner_product([1,2,3]::${type}[], [1,2,3]::${type}[]) = -list_inner_product([1,2,3]::${type}[], [1,2,3]::${type}[]); +---- +true endloop diff --git a/test/sql/function/string/test_jaro_winkler.test b/test/sql/function/string/test_jaro_winkler.test index 3b170d92b431..31f7127cc3b8 100644 --- a/test/sql/function/string/test_jaro_winkler.test +++ b/test/sql/function/string/test_jaro_winkler.test @@ -91,22 +91,22 @@ select jaro_winkler_similarity('foo', 'foo') query T select jaro_winkler_similarity('foo', 'foo ') ---- -0.94 +0.9416666666666667 query T select jaro_winkler_similarity('foo', 'foo ') ---- -0.91 +0.9066666666666667 query T select jaro_winkler_similarity('foo', ' foo ') ---- -0.87 +0.8666666666666667 query T select jaro_winkler_similarity('foo', ' foo') ---- -0.51 +0.5111111111111111 query T select jaro_winkler_similarity('', 'a') @@ -121,7 +121,7 @@ select jaro_winkler_similarity('aaapppp', '') query T select jaro_winkler_similarity('frog', 'fog') ---- -0.93 +0.9249999999999999 query T select jaro_winkler_similarity('fly', 'ant') @@ -131,12 +131,12 @@ select jaro_winkler_similarity('fly', 'ant') query T select jaro_winkler_similarity('elephant', 'hippo') ---- -0.44 +0.44166666666666665 query T select jaro_winkler_similarity('hippo', 'elephant') ---- -0.44 +0.44166666666666665 query T select jaro_winkler_similarity('hippo', 'zzzzzzzz') @@ -151,7 +151,7 @@ select jaro_winkler_similarity('hello', 'hallo') query T select jaro_winkler_similarity('ABC Corporation', 'ABC Corp') ---- -0.91 +0.9066666666666666 # this is 0.95 in the apache commons library but 0.8648324514991181 in ours query T diff --git a/test/sql/function/timestamp/test_icu_makedate.test b/test/sql/function/timestamp/test_icu_makedate.test index 3d712cbc6274..dde6de64dd75 100644 --- a/test/sql/function/timestamp/test_icu_makedate.test +++ b/test/sql/function/timestamp/test_icu_makedate.test @@ -332,44 +332,12 @@ Canada/Newfoundland 2021-05-02 05:11:49.5-07 2021-05-02 00:41:49.5-07 US/Hawaii 2021-12-01 05:54:48.123456-08 2021-12-01 07:54:48.123456-08 NULL NULL NULL -# Invalid zone - ICU defaults to GMT in this situation -query II +# Invalid zone - +statement error SELECT ts, make_timestamptz(yyyy, mm, dd, hr, mn, ss, 'Europe/Duck') mts FROM timeparts; ---- -1001-03-15 (BC) 12:45:42-07:52 1001-03-15 (BC) 04:52:44-07:52 -0044-03-15 (BC) 12:45:42-07:52 0044-03-15 (BC) 04:52:44-07:52 -1962-07-31 05:20:48.123456-07 1962-07-30 22:20:48.123456-07 -1968-12-31 17:03:20.45432-08 1968-12-31 09:03:20.45432-08 
-1991-12-31 17:01:01.4-08 1991-12-31 09:01:01.4-08 -1991-12-31 17:01:02.2-08 1991-12-31 09:01:02.2-08 -1991-12-31 17:01:02.4-08 1991-12-31 09:01:02.4-08 -1993-08-14 01:22:33-07 1993-08-13 18:22:33-07 -1993-08-14 01:22:33.42-07 1993-08-13 18:22:33.42-07 -2001-04-20 07:42:11-07 2001-04-20 00:42:11-07 -2001-04-20 07:42:11.123-07 2001-04-20 00:42:11.123-07 -2004-01-31 04:00:00.00005-08 2004-01-30 20:00:00.00005-08 -2004-01-31 04:00:00.05-08 2004-01-30 20:00:00.05-08 -2004-02-01 04:00:00.00005-08 2004-01-31 20:00:00.00005-08 -2004-02-01 04:00:00.05-08 2004-01-31 20:00:00.05-08 -2004-02-29 05:05:47.123456-08 2004-02-28 21:05:47.123456-08 -2007-12-31 16:00:01.5-08 2007-12-31 08:00:01.5-08 -2007-12-31 16:00:01.594-08 2007-12-31 08:00:01.594-08 -2007-12-31 16:00:01.794-08 2007-12-31 08:00:01.794-08 -2007-12-31 16:00:01.88926-08 2007-12-31 08:00:01.88926-08 -2007-12-31 16:00:01.894-08 2007-12-31 08:00:01.894-08 -2007-12-31 16:00:01.98926-08 2007-12-31 08:00:01.98926-08 -2007-12-31 16:00:01.99926-08 2007-12-31 08:00:01.99926-08 -2007-12-31 16:00:11.1-08 2007-12-31 08:00:11.1-08 -2019-01-05 20:03:02.123456-08 2019-01-05 12:03:02.123456-08 -2019-01-05 20:03:02.5-08 2019-01-05 12:03:02.5-08 -2019-12-31 16:00:01.88926-08 2019-12-31 08:00:01.88926-08 -2020-12-31 13:25:58.745232-08 2020-12-31 05:25:58.745232-08 -2021-04-15 07:55:17.915-07 2021-04-15 00:55:17.915-07 -2021-04-15 07:55:17.915-07 2021-04-15 00:55:17.915-07 -2021-05-02 05:11:49.5-07 2021-05-01 22:11:49.5-07 -2021-12-01 05:54:48.123456-08 2021-11-30 21:54:48.123456-08 -NULL NULL +Unknown TimeZone 'Europe/Duck' # Cast to DATE query II @@ -417,10 +385,12 @@ WITH all_types AS ( ) SELECT make_timestamptz( CAST(century(CAST(a."interval" AS INTERVAL)) AS BIGINT), - CAST(a."bigint" AS BIGINT), CAST(a."bigint" AS BIGINT), - CAST(a."bigint" AS BIGINT), CAST(a."bigint" AS BIGINT), + CAST(a."bigint" AS BIGINT), + CAST(a."bigint" AS BIGINT), + CAST(a."bigint" AS BIGINT), + CAST(a."bigint" AS BIGINT), CAST(txid_current() AS BIGINT), - CAST(CAST(COALESCE(a."bigint", a."bigint") AS BIGINT) AS VARCHAR)) + 'UTC') FROM all_types a; ---- Overflow in subtraction diff --git a/test/sql/index/art/scan/test_art_scan_coverage.test b/test/sql/index/art/scan/test_art_scan_coverage.test index 38cf70ede899..f8d55393afba 100644 --- a/test/sql/index/art/scan/test_art_scan_coverage.test +++ b/test/sql/index/art/scan/test_art_scan_coverage.test @@ -87,3 +87,20 @@ DELETE FROM tab1 WHERE ((col0 > 32) AND col0 < 87) query I rowsort label-empty SELECT pk FROM tab1 WHERE ((col0 > 32) AND col0 < 87) ---- + +# Issue #13785 + +statement ok +CREATE TABLE t0_varchar(c0 VARCHAR); + +statement ok +INSERT INTO t0_varchar(c0) VALUES ('a'), ('a'); + +statement ok +CREATE INDEX t0i0_idx ON t0_varchar(c0 ); + +query I +SELECT c0 FROM t0_varchar WHERE t0_varchar.c0 <= 'a'; +---- +a +a diff --git a/test/sql/json/issues/issue13725.test b/test/sql/json/issues/issue13725.test new file mode 100644 index 000000000000..62664f600995 --- /dev/null +++ b/test/sql/json/issues/issue13725.test @@ -0,0 +1,36 @@ +# name: test/sql/json/issues/issue13725.test +# description: Test issue 13725 - Using both hive_partitioning and hive_types in read_json_objects intermittently segfaults +# group: [issues] + +require json + +# path slashes +require notwindows + +query III +select * +from read_json_objects('data/json/13725/month=*/*.json', hive_partitioning = true, format = auto, hive_types = {'month': int}, filename = true) +where month = 7; +---- +{"hello": "there"} data/json/13725/month=07/mytest.json 7 + +query I 
+select count(*) +from read_json_objects('data/json/13725/month=*/*.json', hive_partitioning = true, format = auto, hive_types = {'month': int}, filename = true) +where month = 7; +---- +1 + +query III +select * +from read_json('data/json/13725/month=*/*.json', hive_partitioning = true, format = auto, hive_types = {'month': int}, filename = true) +where month = 7; +---- +there data/json/13725/month=07/mytest.json 7 + +query I +select count(*) +from read_json('data/json/13725/month=*/*.json', hive_partitioning = true, format = auto, hive_types = {'month': int}, filename = true) +where month = 7; +---- +1 diff --git a/test/sql/parallelism/interquery/tpch_concurrent_checkpoints.test_slow b/test/sql/parallelism/interquery/tpch_concurrent_checkpoints.test_slow new file mode 100644 index 000000000000..c48dce243576 --- /dev/null +++ b/test/sql/parallelism/interquery/tpch_concurrent_checkpoints.test_slow @@ -0,0 +1,45 @@ +# name: test/sql/parallelism/interquery/tpch_concurrent_checkpoints.test_slow +# description: Run queries that reference the same table multiple times while doing checkpoints +# group: [interquery] + +require tpch + +statement ok +CALL dbgen(sf=0.1); + +concurrentloop threadid 0 5 + +loop i 0 20 + +onlyif threadid=0 +query I +INSERT INTO lineitem SELECT * FROM lineitem LIMIT 1000 +---- +1000 + +onlyif threadid=0 +query I +INSERT INTO orders SELECT * FROM orders LIMIT 1000 +---- +1000 + +onlyif threadid=0 +query I +CHECKPOINT + +endloop + +loop i 0 50 + +skipif threadid=0 +statement ok +SELECT COUNT(*) +FROM lineitem +WHERE l_orderkey IN (SELECT l_orderkey FROM lineitem WHERE l_shipdate >= DATE '1995-01-01') AND + l_partkey IN (SELECT l_partkey FROM lineitem WHERE l_returnflag='R') + +endloop + +endloop + + diff --git a/test/sql/parser/expression_depth_limit.test b/test/sql/parser/expression_depth_limit.test index 3c7a84339af0..8f541d09e6df 100644 --- a/test/sql/parser/expression_depth_limit.test +++ b/test/sql/parser/expression_depth_limit.test @@ -6,11 +6,12 @@ statement ok SELECT (1+(1+(1+(1+(1+(1+(1+1))))))); statement ok -SET max_expression_depth TO 3; +SET max_expression_depth TO 7; statement error SELECT (1+(1+(1+(1+(1+(1+(1+1))))))); ---- +expression depth limit statement ok SET max_expression_depth TO 1000; diff --git a/test/sql/setops/union_shared_scan.test b/test/sql/setops/union_shared_scan.test index 027979a7582e..e5a7cd2c9d44 100644 --- a/test/sql/setops/union_shared_scan.test +++ b/test/sql/setops/union_shared_scan.test @@ -21,7 +21,7 @@ UNION ALL SELECT AVG(i) + SUM(i) FROM tbl ORDER BY 1 ---- -5000 -10000 -49995000 -50000000 +4999.5 +10000.0 +49995000.0 +49999999.5 diff --git a/test/sql/show_select/describe_subquery.test b/test/sql/show_select/describe_subquery.test index 39766e814ce7..e167471a1f95 100644 --- a/test/sql/show_select/describe_subquery.test +++ b/test/sql/show_select/describe_subquery.test @@ -38,3 +38,6 @@ null key default extra + +statement ok +(DESCRIBE (values(42))) UNION ALL (DESCRIBE (values(42))); diff --git a/test/sql/storage/temp_directory/offloading_block_files.test_slow b/test/sql/storage/temp_directory/offloading_block_files.test_slow index 3eec77efb8a3..240660afd852 100644 --- a/test/sql/storage/temp_directory/offloading_block_files.test_slow +++ b/test/sql/storage/temp_directory/offloading_block_files.test_slow @@ -1,19 +1,24 @@ # name: test/sql/storage/temp_directory/offloading_block_files.test_slow # group: [temp_directory] +# For smaller block sizes, the total meta data size increases. 
+# On the GitHub runners, this test OOMs on the first query instead of the second. +# The memory usage and database size is around 7.4 GiB after table creation. +# In comparison, it is 3.2 GiB for the default block size. +# FIXME: Investigate why the database size is twice the size of the default block size. +require block_size 262144 + load __TEST_DIR__/offloading_block_files.db +# 500M row table. statement ok -create table counting2 as - from range(100) t1(i) - cross join range(100) t2(j) - cross join range(100) t3(k) - cross join range(100) t4(l) - cross join range(5) t5(m) - select - row_number() over () as i, - random() as random_value - ; +CREATE TABLE tbl AS FROM + range(100) t1(i) + CROSS JOIN range(100) t2(j) + CROSS JOIN range(100) t3(k) + CROSS JOIN range(100) t4(l) + CROSS JOIN range(5) t5(m) + SELECT row_number() OVER () AS i, random() AS random_value; statement ok SET max_temp_directory_size = '1GB'; @@ -22,6 +27,6 @@ statement ok SET memory_limit = '1GB'; statement error -select * from counting2 order by random_value; +SELECT * FROM tbl ORDER BY random_value; ---- Out of Memory Error: failed to offload data block of size diff --git a/test/sql/storage/vacuum/vacuum_partial_deletes.test_slow b/test/sql/storage/vacuum/vacuum_partial_deletes.test_slow index 2aac9f813cf7..6c1ee2986489 100644 --- a/test/sql/storage/vacuum/vacuum_partial_deletes.test_slow +++ b/test/sql/storage/vacuum/vacuum_partial_deletes.test_slow @@ -10,6 +10,9 @@ CREATE TABLE integers(i INTEGER); statement ok INSERT INTO integers SELECT * FROM range(1000000); +statement ok +SET max_vacuum_tasks=99 + query I SELECT SUM(i) FROM integers WHERE i%2<>0 ---- diff --git a/test/sql/storage/wal_torn_write.cpp b/test/sql/storage/wal_torn_write.cpp index 938d3d4f5b3d..2d420d585f66 100644 --- a/test/sql/storage/wal_torn_write.cpp +++ b/test/sql/storage/wal_torn_write.cpp @@ -107,11 +107,22 @@ TEST_CASE("Test WAL checksums", "[storage][.]") { } FlipWALByte(lfs, storage_wal, i); { - // reload and make sure table A is there, and table B is not there - DuckDB db(storage_database, config.get()); - Connection con(db); - REQUIRE_NO_FAIL(con.Query("FROM A")); - REQUIRE_FAIL(con.Query("FROM B")); + // flipping a byte in the checksum leads to an IOException + // flipping a byte in the size of a WAL entry leads to a torn write + // we succeed on either of these cases here + try { + DuckDB db(storage_database, config.get()); + Connection con(db); + REQUIRE_NO_FAIL(con.Query("FROM A")); + REQUIRE_FAIL(con.Query("FROM B")); + } catch (std::exception &ex) { + ErrorData error(ex); + if (error.Type() == ExceptionType::IO) { + REQUIRE(1 == 1); + } else { + throw; + } + } } } DeleteDatabase(storage_database); diff --git a/test/sql/tpch/tpch_power_test.test_slow b/test/sql/tpch/tpch_power_test.test_slow new file mode 100644 index 000000000000..9afba0b6c7e4 --- /dev/null +++ b/test/sql/tpch/tpch_power_test.test_slow @@ -0,0 +1,70 @@ +# name: test/sql/tpch/tpch_power_test.test_slow +# description: Test TPC-H power test +# group: [tpch] + +# TPC-H power test +mode skip + +require parquet + +require tpch + +statement ok +ATTACH '/Users/myth/Programs/duckdb-tpch-power-test/gen/sf100/tpch.duckdb' AS tpch + +statement ok +USE tpch + +statement ok +SELECT COUNT(*) FROM lineitem + +statement ok +SET wal_autocheckpoint='100MB' + +mode output_result + +concurrentloop threadid 0 8 + +statement ok +USE tpch + +# thread 1 is doing the refreshes +loop i 1 100 + +onlyif threadid=0 +statement ok +BEGIN + +onlyif threadid=0 +statement ok +INSERT INTO lineitem 
FROM read_parquet('/Users/myth/Programs/duckdb-tpch-power-test/gen/sf100/lineitem.tbl.u${i}.parquet') + +onlyif threadid=0 +statement ok +INSERT INTO orders FROM read_parquet('/Users/myth/Programs/duckdb-tpch-power-test/gen/sf100/orders.tbl.u${i}.parquet') + +onlyif threadid=0 +statement ok +DELETE FROM orders WHERE o_orderkey IN (SELECT column0 FROM read_parquet('/Users/myth/Programs/duckdb-tpch-power-test/gen/sf100/delete.${i}.parquet')) + +onlyif threadid=0 +statement ok +DELETE FROM lineitem WHERE l_orderkey IN (SELECT column0 FROM read_parquet('/Users/myth/Programs/duckdb-tpch-power-test/gen/sf100/delete.${i}.parquet')) + +onlyif threadid=0 +statement ok +COMMIT + +endloop + +# threads >1 are querying +loop qnr 1 22 + +skipif threadid=0 +statement ok +PRAGMA tpch(${qnr}) + +endloop + + +endloop diff --git a/test/sql/types/varint/test_varint_double.test b/test/sql/types/varint/test_varint_double.test index f308418bdeb3..7aa52a906f18 100644 --- a/test/sql/types/varint/test_varint_double.test +++ b/test/sql/types/varint/test_varint_double.test @@ -61,4 +61,4 @@ Type DOUBLE with value inf can't be cast to the destination type statement error select '1797693134862315708145274237317043567980705675258449965989174768031572607800285387605895586327668781715404589535143824642343213268894641827684675467035375169860499105765512820762454900903893289440758685084551339423045832369032229481658085593321233482747978262041447231687381771809192998812504040261841248583700'::varint::double ---- -Could not convert string \ No newline at end of file +Could not convert varint '1797693134862315708145274237317043567980705675258449965989174768031572607800285387605895586327668781715404589535143824642343213268894641827684675467035375169860499105765512820762454900903893289440758685084551339423045832369032229481658085593321233482747978262041447231687381771809192998812504040261841248583700' to Double \ No newline at end of file diff --git a/test/sql/upsert/test_generated_column.test b/test/sql/upsert/test_generated_column.test new file mode 100644 index 000000000000..00281a2d2a28 --- /dev/null +++ b/test/sql/upsert/test_generated_column.test @@ -0,0 +1,48 @@ +# name: test/sql/upsert/test_generated_column.test +# group: [upsert] + +# SET expression targets b (located after the virtual column) + +statement ok +CREATE TABLE t1 ( + a CHAR NOT NULL, + c CHAR GENERATED ALWAYS AS (a) VIRTUAL, + b INT, +); + +statement ok +CREATE UNIQUE INDEX t1_idx ON t1 (a); + +statement ok +INSERT INTO t1 VALUES ('a', 1) ON CONFLICT(a) DO UPDATE SET b = excluded.b; + +statement ok +INSERT INTO t1 VALUES ('a', 1) ON CONFLICT(a) DO UPDATE SET b = excluded.b; + +query III +select * from t1; +---- +a a 1 + +# The ON CONFLICT (a) is logically located after the virtual column + +statement ok +CREATE TABLE t2 ( + b INT, + c CHAR GENERATED ALWAYS AS (a) VIRTUAL, + a CHAR NOT NULL, +); + +statement ok +CREATE UNIQUE INDEX t2_idx ON t2 (a); + +statement ok +INSERT INTO t2 VALUES (1, 'a') ON CONFLICT(a) DO UPDATE SET b = excluded.b; + +statement ok +INSERT INTO t2 VALUES (1, 'a') ON CONFLICT(a) DO UPDATE SET b = excluded.b; + +query III +select * from t1; +---- +a a 1 diff --git a/test/sqlite/sqllogic_test_runner.cpp b/test/sqlite/sqllogic_test_runner.cpp index 762928175ac3..daf384750bee 100644 --- a/test/sqlite/sqllogic_test_runner.cpp +++ b/test/sqlite/sqllogic_test_runner.cpp @@ -893,7 +893,7 @@ void SQLLogicTestRunner::ExecuteFile(string script) { // file name idx_t filename_start_pos = input_path.find_last_of("/") + 1; - if 
(!StringUtil::EndsWith(input_path, ".gz")) { + if (!StringUtil::EndsWith(input_path, CompressionExtensionFromType(FileCompressionType::GZIP))) { parser.Fail("unzip: input has not a GZIP extension"); } string filename = input_path.substr(filename_start_pos, input_path.size() - filename_start_pos - 3); diff --git a/third_party/libpg_query/grammar/grammar.cpp b/third_party/libpg_query/grammar/grammar.cpp index 17304465ea05..b6ded12596a5 100644 --- a/third_party/libpg_query/grammar/grammar.cpp +++ b/third_party/libpg_query/grammar/grammar.cpp @@ -429,8 +429,8 @@ makeSetOp(PGSetOperation op, bool all, PGNode *larg, PGNode *rarg) n->op = op; n->all = all; - n->larg = (PGSelectStmt *) larg; - n->rarg = (PGSelectStmt *) rarg; + n->larg = larg; + n->rarg = rarg; return (PGNode *) n; } diff --git a/third_party/libpg_query/include/nodes/parsenodes.hpp b/third_party/libpg_query/include/nodes/parsenodes.hpp index 82302a18ff8f..19fdc6f5daa4 100755 --- a/third_party/libpg_query/include/nodes/parsenodes.hpp +++ b/third_party/libpg_query/include/nodes/parsenodes.hpp @@ -1287,8 +1287,8 @@ typedef struct PGSelectStmt { */ PGSetOperation op; /* type of set op */ bool all; /* ALL specified? */ - struct PGSelectStmt *larg; /* left child */ - struct PGSelectStmt *rarg; /* right child */ + struct PGNode *larg; /* left child */ + struct PGNode *rarg; /* right child */ /* Eventually add fields for CORRESPONDING spec here */ } PGSelectStmt; diff --git a/third_party/libpg_query/src_backend_parser_gram.cpp b/third_party/libpg_query/src_backend_parser_gram.cpp index 77f152c41223..5e3b28c0e2a0 100644 --- a/third_party/libpg_query/src_backend_parser_gram.cpp +++ b/third_party/libpg_query/src_backend_parser_gram.cpp @@ -31729,8 +31729,8 @@ makeSetOp(PGSetOperation op, bool all, PGNode *larg, PGNode *rarg) n->op = op; n->all = all; - n->larg = (PGSelectStmt *) larg; - n->rarg = (PGSelectStmt *) rarg; + n->larg = larg; + n->rarg = rarg; return (PGNode *) n; } diff --git a/tools/pythonpkg/README.md b/tools/pythonpkg/README.md index 19b51069fe72..722a4df6695e 100644 --- a/tools/pythonpkg/README.md +++ b/tools/pythonpkg/README.md @@ -1,4 +1,4 @@ -This is the DuckDB Python package +# DuckDB Python package ## Default installation @@ -30,11 +30,19 @@ To install in debug mode, set the environment variable `$DUCKDEBUG=1` (or some o Note that this will override any existing DuckDB installation you might have. You might also run into conflicts depending on your Python environment. In order to remedy that, it is possible to use virtualenv for installation, e.g. by running the following commands: ```bash +cd tools/pythonpkg virtualenv .venv --python=python3.12 source .venv/bin/activate python3 -m pip install . ``` +To test, run: + +```bash +cd ../.. +python3 -c "import duckdb; duckdb.sql('SELECT version() AS version').show()" +``` + ### Installing with make You can build using the make command with `BUILD_PYTHON` flag set. 
For example: diff --git a/tools/pythonpkg/duckdb/query_graph/__main__.py b/tools/pythonpkg/duckdb/query_graph/__main__.py index 6c1437e087fe..404d2162fe3c 100644 --- a/tools/pythonpkg/duckdb/query_graph/__main__.py +++ b/tools/pythonpkg/duckdb/query_graph/__main__.py @@ -99,7 +99,7 @@ def open_utf8(fpath: str, flags: str) -> object: def get_child_timings(top_node: object, query_timings: object) -> str: - node_timing = NodeTiming(top_node['name'], float(top_node['operator_timing'])) + node_timing = NodeTiming(top_node['operator_type'], float(top_node['operator_timing'])) query_timings.add_node_timing(node_timing) for child in top_node['children']: get_child_timings(child, query_timings)
@@ -131,10 +131,9 @@ def get_node_body(name: str, result: str, cardinality: float, extra_info: str) - body += "<br>" new_name = name.replace("_", " ") body += f"<p>{new_name} ({result}s)</p>" - if extra_info: - extra_info = extra_info.replace("[INFOSEPARATOR]", "----") - extra_info = extra_info.replace("<br><br>", "<br>") - body += f"<p>{extra_info}</p>" + body += f"<p>----------------</p>" + body += f"<p>{extra_info}</p>" + body += f"<p>----------------</p>" body += f"<p>cardinality = {cardinality}</p>" # TODO: Expand on timing. Usually available from a detailed profiling body += "<br>"
@@ -145,10 +144,15 @@ def get_node_body(name: str, result: str, cardinality: float, extra_info: str) - def generate_tree_recursive(json_graph: object) -> str: node_prefix_html = "<li>" node_suffix_html = "</li>" - node_body = get_node_body(json_graph["name"], + + extra_info = "" + for key in json_graph['extra_info']: + extra_info += f"{key}: {json_graph['extra_info'][key]}<br>" + + node_body = get_node_body(json_graph["operator_type"], json_graph["operator_timing"], json_graph["operator_cardinality"], - json_graph["extra_info"].replace("\n", "<br>")) + extra_info) children_html = "" if len(json_graph['children']) >= 1:
  • " node_suffix_html = "
  • " - node_body = get_node_body(json_graph["name"], + + extra_info = "" + for key in json_graph['extra_info']: + extra_info += f"{key}: {json_graph['extra_info'][key]}
    " + + node_body = get_node_body(json_graph["operator_type"], json_graph["operator_timing"], json_graph["operator_cardinality"], - json_graph["extra_info"].replace("\n", "
    ")) + extra_info) children_html = "" if len(json_graph['children']) >= 1: diff --git a/tools/pythonpkg/duckdb_python.cpp b/tools/pythonpkg/duckdb_python.cpp index ef6ae0923fa4..281619a15e4d 100644 --- a/tools/pythonpkg/duckdb_python.cpp +++ b/tools/pythonpkg/duckdb_python.cpp @@ -1090,7 +1090,7 @@ PYBIND11_MODULE(DUCKDB_PYTHON_LIB_NAME, m) { // NOLINT py::arg("database") = ":memory:", py::arg("read_only") = false, py::arg_v("config", py::dict(), "None")); m.def("tokenize", PyTokenize, "Tokenizes a SQL string, returning a list of (position, type) tuples that can be " - "used for e.g. syntax highlighting", + "used for e.g., syntax highlighting", py::arg("query")); py::enum_(m, "token_type", py::module_local()) .value("identifier", PySQLTokenType::PY_SQL_TOKEN_IDENTIFIER) diff --git a/tools/pythonpkg/scripts/sqllogictest_python.py b/tools/pythonpkg/scripts/sqllogictest_python.py index 91a2d1e77bf8..3b4b9f9c26c0 100644 --- a/tools/pythonpkg/scripts/sqllogictest_python.py +++ b/tools/pythonpkg/scripts/sqllogictest_python.py @@ -42,6 +42,11 @@ def __init__(self, build_directory: Optional[str] = None): 'test/sql/parser/invisible_spaces.test', # <-- Parser is getting tripped up on the invisible spaces 'test/sql/copy/csv/code_cov/csv_state_machine_invalid_utf.test', # <-- ConversionException is empty, see Python Mega Issue (duckdb-internal #1488) 'test/sql/copy/csv/test_csv_timestamp_tz.test', # <-- ICU is always loaded + 'test/fuzzer/duckfuzz/duck_fuzz_column_binding_tests.test', # <-- ICU is always loaded + 'test/sql/pragma/test_custom_optimizer_profiling.test', # Because of logic related to enabling 'restart' statement capabilities, this will not measure the right statement + 'test/sql/pragma/test_custom_profiling_settings.test', # Because of logic related to enabling 'restart' statement capabilities, this will not measure the right statement + 'test/sql/copy/csv/test_copy.test', # JSON is always loaded + 'test/sql/copy/csv/test_timestamptz_12926.test', # ICU is always loaded ] ) # TODO: get this from the `duckdb` package diff --git a/tools/pythonpkg/src/include/duckdb_python/pyfilesystem.hpp b/tools/pythonpkg/src/include/duckdb_python/pyfilesystem.hpp index b62d54b423a2..6f31c0fc4e07 100644 --- a/tools/pythonpkg/src/include/duckdb_python/pyfilesystem.hpp +++ b/tools/pythonpkg/src/include/duckdb_python/pyfilesystem.hpp @@ -41,6 +41,7 @@ class PythonFileHandle : public FileHandle { private: py::object handle; }; + class PythonFilesystem : public FileSystem { private: const vector protocols; diff --git a/tools/pythonpkg/src/native/python_objects.cpp b/tools/pythonpkg/src/native/python_objects.cpp index d9555e7dad2a..554f0c74818d 100644 --- a/tools/pythonpkg/src/native/python_objects.cpp +++ b/tools/pythonpkg/src/native/python_objects.cpp @@ -8,6 +8,7 @@ #include "duckdb/common/operator/cast_operators.hpp" #include "duckdb_python/pyconnection/pyconnection.hpp" #include "duckdb/common/operator/add.hpp" +#include "duckdb/common/types/varint.hpp" #include "duckdb/core_functions/to_interval.hpp" #include "datetime.h" // Python datetime initialize #1 @@ -678,6 +679,10 @@ py::object PythonObject::FromValue(const Value &val, const LogicalType &type, auto uuid_value = val.GetValueUnsafe(); return import_cache.uuid.UUID()(UUID::ToString(uuid_value)); } + case LogicalTypeId::VARINT: { + auto varint_value = val.GetValueUnsafe(); + return py::str(Varint::VarIntToVarchar(varint_value)); + } case LogicalTypeId::INTERVAL: { auto interval_value = val.GetValueUnsafe(); int64_t days = 
diff --git a/tools/pythonpkg/scripts/sqllogictest_python.py b/tools/pythonpkg/scripts/sqllogictest_python.py index 91a2d1e77bf8..3b4b9f9c26c0 100644 --- a/tools/pythonpkg/scripts/sqllogictest_python.py +++ b/tools/pythonpkg/scripts/sqllogictest_python.py @@ -42,6 +42,11 @@ def __init__(self, build_directory: Optional[str] = None): 'test/sql/parser/invisible_spaces.test', # <-- Parser is getting tripped up on the invisible spaces 'test/sql/copy/csv/code_cov/csv_state_machine_invalid_utf.test', # <-- ConversionException is empty, see Python Mega Issue (duckdb-internal #1488) 'test/sql/copy/csv/test_csv_timestamp_tz.test', # <-- ICU is always loaded + 'test/fuzzer/duckfuzz/duck_fuzz_column_binding_tests.test', # <-- ICU is always loaded + 'test/sql/pragma/test_custom_optimizer_profiling.test', # Because of logic related to enabling 'restart' statement capabilities, this will not measure the right statement + 'test/sql/pragma/test_custom_profiling_settings.test', # Because of logic related to enabling 'restart' statement capabilities, this will not measure the right statement + 'test/sql/copy/csv/test_copy.test', # JSON is always loaded + 'test/sql/copy/csv/test_timestamptz_12926.test', # ICU is always loaded ] ) # TODO: get this from the `duckdb` package
diff --git a/tools/pythonpkg/src/include/duckdb_python/pyfilesystem.hpp b/tools/pythonpkg/src/include/duckdb_python/pyfilesystem.hpp index b62d54b423a2..6f31c0fc4e07 100644 --- a/tools/pythonpkg/src/include/duckdb_python/pyfilesystem.hpp +++ b/tools/pythonpkg/src/include/duckdb_python/pyfilesystem.hpp @@ -41,6 +41,7 @@ class PythonFileHandle : public FileHandle { private: py::object handle; }; + class PythonFilesystem : public FileSystem { private: const vector<string> protocols;
diff --git a/tools/pythonpkg/src/native/python_objects.cpp b/tools/pythonpkg/src/native/python_objects.cpp index d9555e7dad2a..554f0c74818d 100644 --- a/tools/pythonpkg/src/native/python_objects.cpp +++ b/tools/pythonpkg/src/native/python_objects.cpp @@ -8,6 +8,7 @@ #include "duckdb/common/operator/cast_operators.hpp" #include "duckdb_python/pyconnection/pyconnection.hpp" #include "duckdb/common/operator/add.hpp" +#include "duckdb/common/types/varint.hpp" #include "duckdb/core_functions/to_interval.hpp" #include "datetime.h" // Python datetime initialize #1 @@ -678,6 +679,10 @@ py::object PythonObject::FromValue(const Value &val, const LogicalType &type, auto uuid_value = val.GetValueUnsafe<hugeint_t>(); return import_cache.uuid.UUID()(UUID::ToString(uuid_value)); } + case LogicalTypeId::VARINT: { + auto varint_value = val.GetValueUnsafe<string_t>(); + return py::str(Varint::VarIntToVarchar(varint_value)); + } case LogicalTypeId::INTERVAL: { auto interval_value = val.GetValueUnsafe<interval_t>(); int64_t days = duckdb::Interval::DAYS_PER_MONTH * interval_value.months + interval_value.days;
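With the new `VARINT` branch above, fetching a `VARINT` value from Python should come back as a plain string. A hedged sketch, assuming a DuckDB build that includes this change; the exact output formatting is illustrative:

```python
import duckdb

# The VARINT branch converts via Varint::VarIntToVarchar into a py::str,
# so arbitrarily large integers survive the round-trip as strings.
res = duckdb.sql("SELECT 123456789012345678901234567890::VARINT AS v").fetchone()
print(res)  # something like ('123456789012345678901234567890',)
```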
diff --git a/tools/pythonpkg/src/pyconnection.cpp b/tools/pythonpkg/src/pyconnection.cpp index e15f77439274..9f3c92a0f91c 100644 --- a/tools/pythonpkg/src/pyconnection.cpp +++ b/tools/pythonpkg/src/pyconnection.cpp @@ -874,6 +874,7 @@ unique_ptr<DuckDBPyRelation> DuckDBPyConnection::ReadJSON( auto_detect = true; } + py::gil_scoped_release gil; auto read_json_relation = make_shared_ptr<ReadJSONRelation>(connection.context, name, std::move(options), auto_detect); if (read_json_relation == nullptr) { @@ -1383,6 +1384,7 @@ unique_ptr<DuckDBPyRelation> DuckDBPyConnection::ReadCSV(const py::object &name_ // Create the ReadCSV Relation using the 'options' + py::gil_scoped_release gil; auto read_csv_p = connection.ReadCSV(name, std::move(bind_parameters)); auto &read_csv = read_csv_p->Cast<ReadCSVRelation>(); if (file_like_object_wrapper) { @@ -1436,15 +1438,18 @@ unique_ptr<DuckDBPyRelation> DuckDBPyConnection::RunQuery(const py::object &quer shared_ptr<Relation> relation; if (py::none().is(params)) { // FIXME: currently we can't create relations with prepared parameters - auto statement_type = last_statement->type; - switch (statement_type) { - case StatementType::SELECT_STATEMENT: { - auto select_statement = unique_ptr_cast<SQLStatement, SelectStatement>(std::move(last_statement)); - relation = connection.RelationFromQuery(std::move(select_statement), alias); - break; - } - default: - break; + { + py::gil_scoped_release gil; + auto statement_type = last_statement->type; + switch (statement_type) { + case StatementType::SELECT_STATEMENT: { + auto select_statement = unique_ptr_cast<SQLStatement, SelectStatement>(std::move(last_statement)); + relation = connection.RelationFromQuery(std::move(select_statement), alias); + break; + } + default: + break; + } } } @@ -1548,6 +1553,7 @@ unique_ptr<DuckDBPyRelation> DuckDBPyConnection::FromParquet(const string &file_ } named_parameters["compression"] = Value(py::str(compression)); } + py::gil_scoped_release gil; return make_uniq<DuckDBPyRelation>(connection.TableFunction("parquet_scan", params, named_parameters)->Alias(name)); } @@ -1695,7 +1701,10 @@ void DuckDBPyConnection::Interrupt() { void DuckDBPyConnection::InstallExtension(const string &extension, bool force_install) { auto &connection = con.GetConnection(); - ExtensionHelper::InstallExtension(*connection.context, extension, force_install); + + ExtensionInstallOptions options; + options.force_install = force_install; + ExtensionHelper::InstallExtension(*connection.context, extension, options); } void DuckDBPyConnection::LoadExtension(const string &extension) {
diff --git a/tools/pythonpkg/src/pyfilesystem.cpp b/tools/pythonpkg/src/pyfilesystem.cpp index 6a7c45ab2b73..988a27e96410 100644 --- a/tools/pythonpkg/src/pyfilesystem.cpp +++ b/tools/pythonpkg/src/pyfilesystem.cpp @@ -146,12 +146,14 @@ string PythonFilesystem::PathSeparator(const string &path) { return "/"; } int64_t PythonFilesystem::GetFileSize(FileHandle &handle) { + D_ASSERT(!py::gil_check()); // TODO: this value should be cached on the PythonFileHandle PythonGILWrapper gil; return py::int_(filesystem.attr("size")(handle.path)); } void PythonFilesystem::Seek(duckdb::FileHandle &handle, uint64_t location) { + D_ASSERT(!py::gil_check()); PythonGILWrapper gil; auto seek = PythonFileHandle::GetHandle(handle).attr("seek"); @@ -170,18 +172,21 @@ bool PythonFilesystem::CanHandleFile(const string &fpath) { return false; } void PythonFilesystem::MoveFile(const string &source, const string &dest, optional_ptr<FileOpener> opener) { + D_ASSERT(!py::gil_check()); PythonGILWrapper gil; auto move = filesystem.attr("mv"); move(py::str(source), py::str(dest)); } void PythonFilesystem::RemoveFile(const string &filename, optional_ptr<FileOpener> opener) { + D_ASSERT(!py::gil_check()); PythonGILWrapper gil; auto remove = filesystem.attr("rm"); remove(py::str(filename)); } time_t PythonFilesystem::GetLastModifiedTime(FileHandle &handle) { + D_ASSERT(!py::gil_check()); // TODO: this value should be cached on the PythonFileHandle PythonGILWrapper gil; @@ -190,6 +195,7 @@ time_t PythonFilesystem::GetLastModifiedTime(FileHandle &handle) { return py::int_(last_mod.attr("timestamp")()); } void PythonFilesystem::FileSync(FileHandle &handle) { + D_ASSERT(!py::gil_check()); PythonGILWrapper gil; PythonFileHandle::GetHandle(handle).attr("flush")(); @@ -198,11 +204,13 @@ bool PythonFilesystem::DirectoryExists(const string &directory, optional_ptr<FileOpener> opener) { + D_ASSERT(!py::gil_check()); PythonGILWrapper gil; filesystem.attr("rm")(directory, py::arg("recursive") = true); } void PythonFilesystem::CreateDirectory(const string &directory, optional_ptr<FileOpener> opener) { + D_ASSERT(!py::gil_check()); PythonGILWrapper gil; filesystem.attr("mkdir")(py::str(directory)); @@ -211,6 +219,7 @@ bool PythonFilesystem::ListFiles(const string &directory, const std::function o return false; } idx_t PythonFilesystem::SeekPosition(FileHandle &handle) { + D_ASSERT(!py::gil_check()); PythonGILWrapper gil; return py::int_(PythonFileHandle::GetHandle(handle).attr("tell")());
diff --git a/tools/pythonpkg/src/python_replacement_scan.cpp b/tools/pythonpkg/src/python_replacement_scan.cpp index 4c524cf2fa7b..4e42195a50ec 100644 --- a/tools/pythonpkg/src/python_replacement_scan.cpp +++ b/tools/pythonpkg/src/python_replacement_scan.cpp @@ -105,8 +105,6 @@ unique_ptr<TableRef> PythonReplacementScan::TryReplacementObject(const py::objec dependency->AddDependency("replacement_cache", PythonDependencyItem::Create(entry)); subquery->external_dependency = std::move(dependency); return std::move(subquery); - } else if ((arrow_type = DuckDBPyConnection::GetArrowType(entry)) != PyArrowObjectType::Invalid) { - CreateArrowScan(name, entry, *table_function, children, client_properties, arrow_type); } else if (PolarsDataFrame::IsDataFrame(entry)) { auto arrow_dataset = entry.attr("to_arrow")(); CreateArrowScan(name, arrow_dataset, *table_function, children, client_properties, PyArrowObjectType::Table); @@ -114,6 +112,8 @@ unique_ptr<TableRef> PythonReplacementScan::TryReplacementObject(const py::objec auto materialized = entry.attr("collect")(); auto arrow_dataset = materialized.attr("to_arrow")(); CreateArrowScan(name, arrow_dataset, *table_function, children, client_properties, PyArrowObjectType::Table); + } else if ((arrow_type = DuckDBPyConnection::GetArrowType(entry)) != PyArrowObjectType::Invalid) { + CreateArrowScan(name, entry, *table_function, children, client_properties, arrow_type); } else if ((numpytype = DuckDBPyConnection::IsAcceptedNumpyObject(entry)) != NumpyObjectType::INVALID) { string name = "np_" + StringUtil::GenerateRandomName(); py::dict data; // we will convert all the supported format to dict{"key": np.array(value)}.
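The reordering above means a Polars DataFrame is now matched by its dedicated branch before the generic Arrow-type probe runs. A small illustrative sketch of the replacement scan it affects, assuming `polars` is installed:

```python
import duckdb
import polars as pl

df = pl.DataFrame({"i": [1, 2, 3]})

# The replacement scan resolves the name `df` from the calling scope; a Polars
# DataFrame takes the PolarsDataFrame branch (via to_arrow()) rather than the
# generic Arrow path that is now checked later.
print(duckdb.sql("SELECT SUM(i) AS s FROM df").fetchall())  # [(6,)]
```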
diff --git a/tools/pythonpkg/tests/fast/arrow/test_canonical_extensions.py b/tools/pythonpkg/tests/fast/arrow/test_canonical_extensions.py index 7f55f408f23d..6fa4806029c4 100644 --- a/tools/pythonpkg/tests/fast/arrow/test_canonical_extensions.py +++ b/tools/pythonpkg/tests/fast/arrow/test_canonical_extensions.py @@ -12,7 +12,10 @@ class TestCanonicalExtensionTypes(object): - def test_uuid(self, duckdb_cursor): + def test_uuid(self): + duckdb_cursor = duckdb.connect() + duckdb_cursor.execute("SET arrow_lossless_conversion = true") + pa.register_extension_type(UuidType()) storage_array = pa.array([uuid.uuid4().bytes for _ in range(4)], pa.binary(16)) @@ -27,7 +30,38 @@ def test_uuid(self, duckdb_cursor): pa.unregister_extension_type("arrow.uuid") - def test_uuid_exception(self, duckdb_cursor): + def test_uuid_from_duck(self): + duckdb_cursor = duckdb.connect() + duckdb_cursor.execute("SET arrow_lossless_conversion = true") + + pa.register_extension_type(UuidType()) + + arrow_table = duckdb_cursor.execute("select uuid from test_all_types()").fetch_arrow_table() + + assert arrow_table.to_pylist() == [ + {'uuid': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'}, + {'uuid': b'\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff'}, + {'uuid': None}, + ] + + assert duckdb_cursor.execute("FROM arrow_table").fetchall() == [ + (UUID('00000000-0000-0000-0000-000000000000'),), + (UUID('ffffffff-ffff-ffff-ffff-ffffffffffff'),), + (None,), + ] + + arrow_table = duckdb_cursor.execute( + "select '00000000-0000-0000-0000-000000000100'::UUID as uuid" + ).fetch_arrow_table() + + assert arrow_table.to_pylist() == [ + {'uuid': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x00'} + ] + assert duckdb_cursor.execute("FROM arrow_table").fetchall() == [(UUID('00000000-0000-0000-0000-000000000100'),)] + + pa.unregister_extension_type("arrow.uuid") + + def test_uuid_exception(self): class UuidTypeWrong(pa.ExtensionType): def __init__(self): pa.ExtensionType.__init__(self, pa.binary(4), "arrow.uuid") @@ -43,6 +77,9 @@ def __arrow_ext_deserialize__(self, storage_type, serialized): # metadata. 
return UuidTypeWrong() + duckdb_cursor = duckdb.connect() + duckdb_cursor.execute("SET arrow_lossless_conversion = true") + pa.register_extension_type(UuidTypeWrong()) storage_array = pa.array(['aaaa'], pa.binary(4)) @@ -104,7 +141,10 @@ def __arrow_ext_deserialize__(self, storage_type, serialized): duck_arrow = duckdb_cursor.execute('FROM arrow_table').arrow() pa.unregister_extension_type("arrow.json") - def test_uuid_no_def(self, duckdb_cursor): + def test_uuid_no_def(self): + duckdb_cursor = duckdb.connect() + duckdb_cursor.execute("SET arrow_lossless_conversion = true") + res_arrow = duckdb_cursor.execute("select uuid from test_all_types()").arrow() res_duck = duckdb_cursor.execute("from res_arrow").fetchall() assert res_duck == [ @@ -113,7 +153,26 @@ def test_uuid_no_def(self, duckdb_cursor): (None,), ] - def test_uuid_no_def_stream(self, duckdb_cursor): + def test_uuid_no_def_lossless(self): + duckdb_cursor = duckdb.connect() + res_arrow = duckdb_cursor.execute("select uuid from test_all_types()").arrow() + assert res_arrow.to_pylist() == [ + {'uuid': '00000000-0000-0000-0000-000000000000'}, + {'uuid': 'ffffffff-ffff-ffff-ffff-ffffffffffff'}, + {'uuid': None}, + ] + + res_duck = duckdb_cursor.execute("from res_arrow").fetchall() + assert res_duck == [ + ('00000000-0000-0000-0000-000000000000',), + ('ffffffff-ffff-ffff-ffff-ffffffffffff',), + (None,), + ] + + def test_uuid_no_def_stream(self): + duckdb_cursor = duckdb.connect() + duckdb_cursor.execute("SET arrow_lossless_conversion = true") + res_arrow = duckdb_cursor.execute("select uuid from test_all_types()").fetch_record_batch() res_duck = duckdb.execute("from res_arrow").fetchall() assert res_duck == [ @@ -137,15 +196,17 @@ def test_function(x): pa.unregister_extension_type("arrow.uuid") - def test_uuid_udf_unregistered(self, duckdb_cursor): + def test_uuid_udf_unregistered(self): + duckdb_cursor = duckdb.connect() + duckdb_cursor.execute("SET arrow_lossless_conversion = true") + def test_function(x): print(x.type.__class__) return x - con = duckdb.connect() - con.create_function('test', test_function, ['UUID'], 'UUID', type='arrow') + duckdb_cursor.create_function('test', test_function, ['UUID'], 'UUID', type='arrow') - rel = con.sql("select ? as x", params=[uuid.UUID('ffffffff-ffff-ffff-ffff-ffffffffffff')]) + rel = duckdb_cursor.sql("select ? 
as x", params=[uuid.UUID('ffffffff-ffff-ffff-ffff-ffffffffffff')]) with pytest.raises(duckdb.Error, match="It seems that you are using the UUID arrow canonical extension"): rel.project("test(x) from t").fetchall() @@ -196,7 +257,6 @@ def test_hugeint(self): pa.unregister_extension_type("duckdb.hugeint") def test_uhugeint(self, duckdb_cursor): - pa.register_extension_type(UHugeIntType()) storage_array = pa.array([b'\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff'], pa.binary(16)) diff --git a/tools/pythonpkg/tests/fast/pandas/test_datetime_timestamp.py b/tools/pythonpkg/tests/fast/pandas/test_datetime_timestamp.py index 57ccf565f56c..e3b265016c0a 100644 --- a/tools/pythonpkg/tests/fast/pandas/test_datetime_timestamp.py +++ b/tools/pythonpkg/tests/fast/pandas/test_datetime_timestamp.py @@ -105,18 +105,17 @@ def test_timestamp_timezone_negative_extreme(self, pandas, duckdb_cursor): def test_timestamp_timezone_positive_extreme(self, pandas, duckdb_cursor): duckdb_time = duckdb_cursor.sql( """ - SELECT timestamp '2021-12-31 23:00:00' AT TIME ZONE 'kea_CV' as "0" + SELECT timestamp '2021-12-31 23:00:00' AT TIME ZONE 'Etc/GMT-14' as "0" """ ).df() - # 'kea_CV' is 20 hours ahead of UTC - offset = datetime.timedelta(hours=20) + offset = datetime.timedelta(hours=14) timezone = datetime.timezone(offset) df_in = pandas.DataFrame( { 0: pandas.Series( - data=[datetime.datetime(year=2022, month=1, day=1, hour=19, tzinfo=timezone)], dtype='object' + data=[datetime.datetime(year=2021, month=12, day=31, hour=23, tzinfo=timezone)], dtype='object' ) } ) diff --git a/tools/pythonpkg/tests/fast/test_filesystem.py b/tools/pythonpkg/tests/fast/test_filesystem.py index 1bc4dcfb761a..cbf6dac61e8f 100644 --- a/tools/pythonpkg/tests/fast/test_filesystem.py +++ b/tools/pythonpkg/tests/fast/test_filesystem.py @@ -13,7 +13,7 @@ importorskip('fsspec', '2022.11.0') from fsspec import filesystem, AbstractFileSystem from fsspec.implementations.memory import MemoryFileSystem -from fsspec.implementations.local import LocalFileOpener +from fsspec.implementations.local import LocalFileOpener, LocalFileSystem FILENAME = 'integers.csv' @@ -257,3 +257,32 @@ def test_read_hive_partition_with_columns_written( # hive partitioning: no cast to int duckdb_cursor.execute(query + ', HIVE_PARTITIONING=1' + ', HIVE_TYPES_AUTOCAST=0' + ');') assert duckdb_cursor.fetchall() == [(2, '2')] + + def test_parallel_union_by_name(self, tmp_path): + pa = importorskip('pyarrow') + pq = importorskip('pyarrow.parquet') + fsspec = importorskip('fsspec') + + table1 = pa.Table.from_pylist( + [ + {'time': 1719568210134107692, 'col1': 1}, + ] + ) + table1_path = tmp_path / "table1.parquet" + pa.parquet.write_table(table1, table1_path) + + table2 = pa.Table.from_pylist( + [ + {'time': 1719568210134107692, 'col1': 1}, + ] + ) + table2_path = tmp_path / "table2.parquet" + pq.write_table(table2, table2_path) + + c = duckdb.connect() + c.register_filesystem(LocalFileSystem()) + + q = f"SELECT * FROM read_parquet('file://{tmp_path}/table*.parquet', union_by_name = TRUE) ORDER BY time DESC LIMIT 1" + + res = c.sql(q).fetchall() + assert res == [(1719568210134107692, 1)]