Skip to content

Commit

Permalink
Merge branch 'master' into fts_attach
Browse files Browse the repository at this point in the history
  • Loading branch information
lnkuiper committed Jul 14, 2023
2 parents f6f1a9f + ea57bf0 commit 64b5fc7
Show file tree
Hide file tree
Showing 31 changed files with 228 additions and 206 deletions.
4 changes: 2 additions & 2 deletions .github/config/uncovered_files.csv
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ common/types/blob.cpp 3
common/types/chunk_collection.cpp 94
common/types/column/column_data_allocator.cpp 13
common/types/column/column_data_collection.cpp 50
common/types/column/partitioned_column_data.cpp 6
common/types/column/partitioned_column_data.cpp 8
common/types/conflict_info.cpp 2
common/types/conflict_manager.cpp 3
common/types/data_chunk.cpp 19
Expand Down Expand Up @@ -342,7 +342,7 @@ include/duckdb/common/operator/numeric_cast.hpp 2
include/duckdb/common/operator/subtract.hpp 2
include/duckdb/common/pipe_file_system.hpp 3
include/duckdb/common/radix.hpp 2
include/duckdb/common/radix_partitioning.hpp 3
include/duckdb/common/radix_partitioning.hpp 5
include/duckdb/common/re2_regex.hpp 21
include/duckdb/common/serializer.hpp 4
include/duckdb/common/serializer/format_deserializer.hpp 37
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/NightlyTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ concurrency:

env:
GH_TOKEN: ${{ secrets.GH_TOKEN }}
DUCKDB_WASM_VERSION: "0e27318"
DUCKDB_WASM_VERSION: "a8f2c38"
CCACHE_SAVE: ${{ github.repository != 'duckdb/duckdb' }}

jobs:
Expand Down Expand Up @@ -688,7 +688,7 @@ jobs:

- uses: mymindstorm/setup-emsdk@v12
with:
version: 3.1.41
version: 'latest'

- name: Setup
shell: bash
Expand Down
8 changes: 8 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,14 @@ function(build_loadable_extension NAME PARAMETERS)
build_loadable_extension_directory(${NAME} "extension/${NAME}" "${PARAMETERS}" ${FILES})
endfunction()

function(build_static_extension NAME PARAMETERS)
# all parameters after name
set(FILES ${ARGV})
list(REMOVE_AT FILES 0)
add_library(${NAME}_extension STATIC ${FILES})
target_link_libraries(${NAME}_extension duckdb_static)
endfunction()

# Internal extension register function
function(register_extension NAME DONT_LINK DONT_BUILD LOAD_TESTS PATH INCLUDE_PATH TEST_PATH)
string(TOLOWER ${NAME} EXTENSION_NAME_LOWERCASE)
Expand Down
2 changes: 1 addition & 1 deletion extension/autocomplete/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ include_directories(include)

set(AUTOCOMPLETE_EXTENSION_FILES autocomplete_extension.cpp)

add_library(autocomplete_extension STATIC ${AUTOCOMPLETE_EXTENSION_FILES})
build_static_extension(autocomplete ${AUTOCOMPLETE_EXTENSION_FILES})
set(PARAMETERS "-warnings")
build_loadable_extension(autocomplete ${PARAMETERS}
${AUTOCOMPLETE_EXTENSION_FILES})
Expand Down
3 changes: 1 addition & 2 deletions extension/excel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ include_directories(numformat/include)
include_directories(include)
add_subdirectory(numformat)

add_library(excel_extension STATIC excel_extension.cpp
${NUMFORMAT_OBJECT_FILES})
build_static_extension(excel excel_extension.cpp ${NUMFORMAT_OBJECT_FILES})
set(PARAMETERS "-warnings")
build_loadable_extension(excel ${PARAMETERS} excel_extension.cpp
${NUMFORMAT_OBJECT_FILES})
Expand Down
2 changes: 1 addition & 1 deletion extension/fts/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ set(FTS_SOURCES
../../third_party/snowball/src_c/stem_UTF_8_tamil.cpp
../../third_party/snowball/src_c/stem_UTF_8_turkish.cpp)

add_library(fts_extension STATIC ${FTS_SOURCES})
build_static_extension(fts ${FTS_SOURCES})
set(PARAMETERS "-warnings")
build_loadable_extension(fts ${PARAMETERS} ${FTS_SOURCES})

Expand Down
4 changes: 2 additions & 2 deletions extension/httpfs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ add_extension_definitions()

include_directories(include ../../third_party/httplib ../parquet/include)

add_library(httpfs_extension STATIC s3fs.cpp httpfs.cpp crypto.cpp
httpfs_extension.cpp)
build_static_extension(httpfs s3fs.cpp httpfs.cpp crypto.cpp
httpfs_extension.cpp)
set(PARAMETERS "-warnings")
build_loadable_extension(httpfs ${PARAMETERS} s3fs.cpp httpfs.cpp crypto.cpp
httpfs_extension.cpp)
Expand Down
2 changes: 1 addition & 1 deletion extension/icu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ set(ICU_EXTENSION_FILES
icu-timebucket.cpp
icu-timezone.cpp)

add_library(icu_extension STATIC ${ICU_EXTENSION_FILES})
build_static_extension(icu ${ICU_EXTENSION_FILES})
link_threads(icu_extension)
disable_target_warnings(icu_extension)
set(PARAMETERS "-no-warnings")
Expand Down
3 changes: 1 addition & 2 deletions extension/jemalloc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ add_subdirectory(jemalloc)

set(JEMALLOC_EXTENSION_FILES jemalloc_extension.cpp ${JEMALLOC_OBJECT_FILES})

add_library(jemalloc_extension STATIC ${JEMALLOC_EXTENSION_FILES})

build_static_extension(jemalloc ${JEMALLOC_EXTENSION_FILES})
# we do not do build_loadable_extension here because jemalloc is static-only

install(
Expand Down
2 changes: 1 addition & 1 deletion extension/json/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ set(JSON_EXTENSION_FILES
json_functions/read_json_objects.cpp
${YYJSON_OBJECT_FILES})

add_library(json_extension STATIC ${JSON_EXTENSION_FILES})
build_static_extension(json ${JSON_EXTENSION_FILES})
set(PARAMETERS "-warnings")
build_loadable_extension(json ${PARAMETERS} ${JSON_EXTENSION_FILES})

Expand Down
3 changes: 2 additions & 1 deletion extension/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ if(NOT CLANG_TIDY)
../../third_party/zstd/compress/zstd_opt.cpp)
endif()

add_library(parquet_extension STATIC ${PARQUET_EXTENSION_FILES})
build_static_extension(parquet ${PARQUET_EXTENSION_FILES})

set(PARAMETERS "-warnings")
build_loadable_extension(parquet ${PARAMETERS} ${PARQUET_EXTENSION_FILES})

Expand Down
2 changes: 1 addition & 1 deletion extension/sqlsmith/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ set(SQLSMITH_SOURCES
sqlsmith_extension.cpp statement_generator.cpp statement_simplifier.cpp
fuzzyduck.cpp ${SQLSMITH_OBJECT_FILES})

add_library(sqlsmith_extension STATIC ${SQLSMITH_SOURCES})
build_static_extension(sqlsmith ${SQLSMITH_SOURCES})
set(PARAMETERS "-warnings")
build_loadable_extension(sqlsmith ${PARAMETERS} ${SQLSMITH_SOURCES})

Expand Down
2 changes: 1 addition & 1 deletion extension/tpcds/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ include_directories(include)
include_directories(dsdgen/include)
add_subdirectory(dsdgen)

add_library(tpcds_extension STATIC tpcds_extension.cpp ${DSDGEN_OBJECT_FILES})
build_static_extension(tpcds tpcds_extension.cpp ${DSDGEN_OBJECT_FILES})
set(PARAMETERS "-warnings")
build_loadable_extension(tpcds ${PARAMETERS} tpcds_extension.cpp
${DSDGEN_OBJECT_FILES})
Expand Down
3 changes: 1 addition & 2 deletions extension/tpch/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@ include_directories(dbgen/include)
include_directories(include)
add_subdirectory(dbgen)

add_library(tpch_extension STATIC tpch_extension.cpp ${DBGEN_OBJECT_FILES})

build_static_extension(tpch tpch_extension.cpp ${DBGEN_OBJECT_FILES})
set(PARAMETERS "-warnings")
build_loadable_extension(tpch ${PARAMETERS} tpch_extension.cpp
${DBGEN_OBJECT_FILES})
Expand Down
2 changes: 1 addition & 1 deletion extension/visualizer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ project(VisualizerExtension)

include_directories(include)

add_library(visualizer_extension STATIC visualizer_extension.cpp)
build_static_extension(visualizer visualizer_extension.cpp)
set(PARAMETERS "-warnings")
build_loadable_extension(visualizer ${PARAMETERS} visualizer_extension.cpp)
install(
Expand Down
73 changes: 30 additions & 43 deletions src/common/sort/partition_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,22 @@ PartitionGlobalSinkState::PartitionGlobalSinkState(ClientContext &context,
const vector<unique_ptr<BaseStatistics>> &partition_stats,
idx_t estimated_cardinality)
: context(context), buffer_manager(BufferManager::GetBufferManager(context)), allocator(Allocator::Get(context)),
fixed_bits(0), payload_types(payload_types), memory_per_thread(0), count(0) {
fixed_bits(0), payload_types(payload_types), memory_per_thread(0), max_bits(1), count(0) {

GenerateOrderings(partitions, orders, partition_bys, order_bys, partition_stats);

memory_per_thread = PhysicalOperator::GetMaxThreadMemory(context);
external = ClientConfig::GetConfig(context).force_external;

const auto thread_pages = PreviousPowerOfTwo(memory_per_thread / (4 * idx_t(Storage::BLOCK_ALLOC_SIZE)));
while (max_bits < 10 && (thread_pages >> max_bits) > 1) {
++max_bits;
}

if (!orders.empty()) {
grouping_types = payload_types;
grouping_types.push_back(LogicalType::HASH);
auto types = payload_types;
types.push_back(LogicalType::HASH);
grouping_types.Initialize(types);

ResizeGroupingData(estimated_cardinality);
}
Expand All @@ -108,10 +114,15 @@ void PartitionGlobalSinkState::SyncPartitioning(const PartitionGlobalSinkState &
const auto old_bits = grouping_data ? grouping_data->GetRadixBits() : 0;
if (fixed_bits != old_bits) {
const auto hash_col_idx = payload_types.size();
grouping_data = make_uniq<RadixPartitionedColumnData>(context, grouping_types, fixed_bits, hash_col_idx);
grouping_data = make_uniq<RadixPartitionedTupleData>(buffer_manager, grouping_types, fixed_bits, hash_col_idx);
}
}

unique_ptr<RadixPartitionedTupleData> PartitionGlobalSinkState::CreatePartition(idx_t new_bits) const {
const auto hash_col_idx = payload_types.size();
return make_uniq<RadixPartitionedTupleData>(buffer_manager, grouping_types, new_bits, hash_col_idx);
}

void PartitionGlobalSinkState::ResizeGroupingData(idx_t cardinality) {
// Have we started to combine? Then just live with it.
if (fixed_bits || (grouping_data && !grouping_data->GetPartitions().empty())) {
Expand All @@ -121,47 +132,31 @@ void PartitionGlobalSinkState::ResizeGroupingData(idx_t cardinality) {
const idx_t partition_size = STANDARD_ROW_GROUPS_SIZE;
const auto bits = grouping_data ? grouping_data->GetRadixBits() : 0;
auto new_bits = bits ? bits : 4;
while (new_bits < 10 && (cardinality / RadixPartitioning::NumberOfPartitions(new_bits)) > partition_size) {
while (new_bits < max_bits && (cardinality / RadixPartitioning::NumberOfPartitions(new_bits)) > partition_size) {
++new_bits;
}

// Repartition the grouping data
if (new_bits != bits) {
const auto hash_col_idx = payload_types.size();
grouping_data = make_uniq<RadixPartitionedColumnData>(context, grouping_types, new_bits, hash_col_idx);
grouping_data = CreatePartition(new_bits);
}
}

void PartitionGlobalSinkState::SyncLocalPartition(GroupingPartition &local_partition, GroupingAppend &local_append) {
// We are done if the local_partition is right sized.
auto &local_radix = local_partition->Cast<RadixPartitionedColumnData>();
if (local_radix.GetRadixBits() == grouping_data->GetRadixBits()) {
auto &local_radix = local_partition->Cast<RadixPartitionedTupleData>();
const auto new_bits = grouping_data->GetRadixBits();
if (local_radix.GetRadixBits() == new_bits) {
return;
}

// If the local partition is now too small, flush it and reallocate
auto new_partition = grouping_data->CreateShared();
auto new_append = make_uniq<PartitionedColumnDataAppendState>();
new_partition->InitializeAppendState(*new_append);

auto new_partition = CreatePartition(new_bits);
local_partition->FlushAppendState(*local_append);
auto &local_groups = local_partition->GetPartitions();
for (auto &local_group : local_groups) {
ColumnDataScanState scanner;
local_group->InitializeScan(scanner);

DataChunk scan_chunk;
local_group->InitializeScanChunk(scan_chunk);
for (scan_chunk.Reset(); local_group->Scan(scanner, scan_chunk); scan_chunk.Reset()) {
new_partition->Append(*new_append, scan_chunk);
}
}

// The append state has stale pointers to the old local partition, so nuke it from orbit.
new_partition->FlushAppendState(*new_append);
local_partition->Repartition(*new_partition);

local_partition = std::move(new_partition);
local_append = make_uniq<PartitionedColumnDataAppendState>();
local_append = make_uniq<PartitionedTupleDataAppendState>();
local_partition->InitializeAppendState(*local_append);
}

Expand All @@ -170,8 +165,8 @@ void PartitionGlobalSinkState::UpdateLocalPartition(GroupingPartition &local_par
lock_guard<mutex> guard(lock);

if (!local_partition) {
local_partition = grouping_data->CreateShared();
local_append = make_uniq<PartitionedColumnDataAppendState>();
local_partition = CreatePartition(grouping_data->GetRadixBits());
local_append = make_uniq<PartitionedTupleDataAppendState>();
local_partition->InitializeAppendState(*local_append);
return;
}
Expand All @@ -196,7 +191,7 @@ void PartitionGlobalSinkState::CombineLocalPartition(GroupingPartition &local_pa
grouping_data->Combine(*local_partition);
}

void PartitionGlobalSinkState::BuildSortState(ColumnDataCollection &group_data, GlobalSortState &global_sort) const {
void PartitionGlobalSinkState::BuildSortState(TupleDataCollection &group_data, GlobalSortState &global_sort) const {
// Set up the sort expression computation.
vector<LogicalType> sort_types;
ExpressionExecutor executor(context);
Expand All @@ -221,30 +216,22 @@ void PartitionGlobalSinkState::BuildSortState(ColumnDataCollection &group_data,
for (column_t i = 0; i < payload_types.size(); ++i) {
column_ids.emplace_back(i);
}
ColumnDataConsumer scanner(group_data, column_ids);
ColumnDataConsumerScanState chunk_state;
chunk_state.current_chunk_state.properties = ColumnDataScanProperties::ALLOW_ZERO_COPY;
scanner.InitializeScan();
for (auto chunk_idx = scanner.ChunkCount(); chunk_idx-- > 0;) {
if (!scanner.AssignChunk(chunk_state)) {
break;
}
scanner.ScanChunk(chunk_state, payload_chunk);

TupleDataScanState chunk_state;
group_data.InitializeScan(chunk_state, column_ids);
while (group_data.Scan(chunk_state, payload_chunk)) {
sort_chunk.Reset();
executor.Execute(payload_chunk, sort_chunk);

local_sort.SinkChunk(sort_chunk, payload_chunk);
if (local_sort.SizeInBytes() > memory_per_thread) {
local_sort.Sort(global_sort, true);
}
scanner.FinishChunk(chunk_state);
}

global_sort.AddLocalState(local_sort);
}

void PartitionGlobalSinkState::BuildSortState(ColumnDataCollection &group_data, PartitionGlobalHashGroup &hash_group) {
void PartitionGlobalSinkState::BuildSortState(TupleDataCollection &group_data, PartitionGlobalHashGroup &hash_group) {
BuildSortState(group_data, *hash_group.global_sort);

hash_group.count += group_data.Count();
Expand Down
4 changes: 2 additions & 2 deletions src/include/duckdb/common/opener_file_system.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,11 @@ class OpenerFileSystem : public FileSystem {
bool IsPipe(const string &filename) override {
return GetFileSystem().IsPipe(filename);
}
virtual void RemoveFile(const string &filename) override {
void RemoveFile(const string &filename) override {
GetFileSystem().RemoveFile(filename);
}

virtual vector<string> Glob(const string &path, FileOpener *opener = nullptr) override {
vector<string> Glob(const string &path, FileOpener *opener = nullptr) override {
if (opener) {
throw InternalException("OpenerFileSystem cannot take an opener - the opener is pushed automatically");
}
Expand Down
Loading

0 comments on commit 64b5fc7

Please sign in to comment.