Merge branch 'main' into fuzzer_stuff
lnkuiper committed Sep 12, 2023
2 parents 001ea00 + 312b995 commit 37bf81c
Showing 94 changed files with 1,472 additions and 764 deletions.
5 changes: 3 additions & 2 deletions .github/config/uncovered_files.csv
@@ -94,7 +94,7 @@ common/types/string_heap.cpp 11
common/types/time.cpp 25
common/types/timestamp.cpp 19
common/types/uuid.cpp 3
-common/types/validity_mask.cpp 12
+common/types/validity_mask.cpp 13
common/types/value.cpp 180
common/types/vector.cpp 153
common/value_operations/comparison_operations.cpp 32
@@ -769,7 +769,7 @@ storage/checkpoint/row_group_writer.cpp 6
storage/checkpoint/table_data_writer.cpp 3
storage/checkpoint/write_overflow_strings_to_disk.cpp 5
storage/checkpoint_manager.cpp 9
-storage/compression/bitpacking.cpp 19
+storage/compression/bitpacking.cpp 23
storage/compression/dictionary_compression.cpp 2
storage/compression/fsst.cpp 11
storage/compression/numeric_constant.cpp 11
@@ -807,6 +807,7 @@ storage/table/column_segment.cpp 62
storage/table/list_column_data.cpp 28
storage/table/row_group.cpp 54
storage/table/row_group_collection.cpp 7
+storage/table/row_version_manager.cpp 6
storage/table/scan_state.cpp 7
storage/table/standard_column_data.cpp 2
storage/table/struct_column_data.cpp 24
101 changes: 101 additions & 0 deletions .github/workflows/Wasm.yml
@@ -0,0 +1,101 @@
# This is a reusable workflow to be used by extensions for CI. It:
# - builds the extension using the CI workflow from the corresponding DuckDB version
# - runs the tests using `make test` in the extension repository
# - (optionally) deploys the binaries to S3

name: DuckDB-Wasm extensions
on:
  workflow_dispatch:
    inputs:
      # Git ref of the duckdb-wasm repo
      duckdb-wasm-ref:
        required: false
        type: string
        default: main
      # Git ref of the duckdb repo
      duckdb-ref:
        required: true
        type: string
      # Publish extensions on extensions.duckdb.org?
      release_s3:
        required: false
        type: boolean
        default: false

env:
  GH_TOKEN: ${{ secrets.GH_TOKEN }}

jobs:
  build_and_publish:
    name: Build and publish
    runs-on: ubuntu-latest
    strategy:
      matrix:
        duckdb_wasm_arch: [ 'mvp', 'eh' ]
    env:
      GEN: Ninja
      VCPKG_TOOLCHAIN_PATH: ${{ github.workspace }}/vcpkg/scripts/buildsystems/vcpkg.cmake
      DUCKDB_PLATFORM: "wasm_${{ matrix.duckdb_wasm_arch }}"

    steps:
      - uses: actions/checkout@v3
        with:
          submodules: 'recursive'
          repository: 'duckdb/duckdb-wasm'
          ref: ${{ github.event.inputs.duckdb-wasm-ref }}
          fetch-depth: 0

      - name: Prepare repository
        run: |
          [ -f duckdb.patch ] && cd submodules/duckdb && git apply ../../duckdb.patch || echo "No patching needed"
      - uses: mymindstorm/setup-emsdk@v12
        with:
          version: 'latest'
      - name: Setup vcpkg
        uses: lukka/run-vcpkg@v11
        with:
          vcpkgGitCommitId: 501db0f17ef6df184fcdbfbe0f87cde2313b6ab1

      - name: Setup Ccache
        uses: hendrikmuhs/ccache-action@main
        with:
          key: ${{ github.job }}

      - name: Git submodule status
        run: |
          brotli --version
          git submodule status > git_submodule_status.txt
      - name: Build Wasm module
        run: |
          cp .github/config/extension_config_wasm.cmake submodules/duckdb/extension/extension_config.cmake
          DUCKDB_WASM_LOADABLE_EXTENSIONS="signed" GEN=ninja ./scripts/wasm_build_lib.sh relsize ${{ matrix.duckdb_wasm_arch }}
          bash ./scripts/build_loadable.sh relsize ${{ matrix.duckdb_wasm_arch }}
      - uses: actions/checkout@v3
        with:
          path: 'duckdb'

      - name: Sign and deploy Wasm extensions
        if: ${{ github.event.inputs.release_s3 }}
        env:
          DUCKDB_EXTENSION_SIGNING_PK: ${{ secrets.DUCKDB_EXTENSION_SIGNING_PK }}
        run: |
          bash ./duckdb/scripts/extension-upload-wasm.sh ${{ env.DUCKDB_PLATFORM }} ${{ github.event.inputs.duckdb-ref }}
      - name: Upload artifact
        uses: actions/upload-artifact@v3
        with:
          name: duckdb-wasm-${{ matrix.duckdb_wasm_arch }}
          path: loadable_extensions/
          retention-days: 1

  trigger_github_pages_build:
    name: Trigger follow-up work
    runs-on: ubuntu-latest
    needs: build_and_publish
    steps:
      - name: Move control to duckdb-wasm
        run: |
          curl -XPOST -u "${{secrets.PAT_USER}}:${{secrets.PAT_TOKEN}}" -H "Accept: application/vnd.github.everest-preview+json" -H "Content-Type: application/json" https://api.github.com/repos/duckdb/duckdb-wasm/actions/workflows/main.yml/dispatches --data '{"ref":"${{ github.event.inputs.duckdb-wasm-ref}}"}'
5 changes: 5 additions & 0 deletions extension/httpfs/httpfs.cpp
@@ -489,6 +489,11 @@ void HTTPFileSystem::Seek(FileHandle &handle, idx_t location) {
    sfh.file_offset = location;
}

+idx_t HTTPFileSystem::SeekPosition(FileHandle &handle) {
+    auto &sfh = (HTTPFileHandle &)handle;
+    return sfh.file_offset;
+}

// Get either the local, global, or no cache depending on settings
static optional_ptr<HTTPMetadataCache> TryGetMetadataCache(FileOpener *opener, HTTPFileSystem &httpfs) {
    auto client_context = FileOpener::TryGetClientContext(opener);
34 changes: 23 additions & 11 deletions extension/httpfs/include/http_metadata_cache.hpp
@@ -28,37 +28,49 @@ class HTTPMetadataCache : public ClientContextState {
    void Insert(const string &path, HTTPMetadataCacheEntry val) {
        if (shared) {
            lock_guard<mutex> parallel_lock(lock);
+           map[path] = val;
+       } else {
+           map[path] = val;
        }
-       map[path] = val;
    };

    void Erase(string path) {
        if (shared) {
            lock_guard<mutex> parallel_lock(lock);
+           map.erase(path);
+       } else {
+           map.erase(path);
        }
-       map.erase(path);
    };

    bool Find(string path, HTTPMetadataCacheEntry &ret_val) {
        if (shared) {
            lock_guard<mutex> parallel_lock(lock);
-       }
-       auto lookup = map.find(path);
-       if (lookup != map.end()) {
-           ret_val = lookup->second;
-           return true;
+           auto lookup = map.find(path);
+           if (lookup != map.end()) {
+               ret_val = lookup->second;
+               return true;
+           } else {
+               return false;
+           }
        } else {
-           return false;
+           auto lookup = map.find(path);
+           if (lookup != map.end()) {
+               ret_val = lookup->second;
+               return true;
+           } else {
+               return false;
+           }
        }
-
-       return false;
    };

    void Clear() {
        if (shared) {
            lock_guard<mutex> parallel_lock(lock);
+           map.clear();
+       } else {
+           map.clear();
        }
-       map.clear();
    }

//! Called by the ClientContext when the current query ends
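
The point of this hunk: a lock_guard declared inside `if (shared) { ... }` is destroyed at the closing brace, so in the old shape the map access that followed ran without the mutex held. A minimal stand-alone sketch of the corrected pattern (SharedMap and its members are invented stand-ins for HTTPMetadataCache):

#include <map>
#include <mutex>
#include <string>

// Sketch only: SharedMap stands in for HTTPMetadataCache.
struct SharedMap {
    bool shared = true;
    std::mutex lock;
    std::map<std::string, int> map;

    void Insert(const std::string &path, int val) {
        if (shared) {
            // the guard lives until the end of this branch,
            // so the write below it is protected
            std::lock_guard<std::mutex> parallel_lock(lock);
            map[path] = val;
        } else {
            // unshared cache: no other thread can touch the map
            map[path] = val;
        }
    }
    // Broken variant for contrast:
    //   if (shared) { std::lock_guard<std::mutex> l(lock); }
    //   map[path] = val;   // guard already destroyed -> data race
};

int main() {
    SharedMap cache;
    cache.Insert("s3://bucket/file.parquet", 42);
    return 0;
}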
1 change: 1 addition & 0 deletions extension/httpfs/include/httpfs.hpp
@@ -125,6 +125,7 @@ class HTTPFileSystem : public FileSystem {
    time_t GetLastModifiedTime(FileHandle &handle) override;
    bool FileExists(const string &filename) override;
    void Seek(FileHandle &handle, idx_t location) override;
+   idx_t SeekPosition(FileHandle &handle) override;
    bool CanHandleFile(const string &fpath) override;
    bool CanSeek() override {
        return true;
2 changes: 1 addition & 1 deletion extension/parquet/parquet_extension.cpp
@@ -118,7 +118,7 @@ struct ParquetWriteBindData : public TableFunctionData {
    vector<LogicalType> sql_types;
    vector<string> column_names;
    duckdb_parquet::format::CompressionCodec::type codec = duckdb_parquet::format::CompressionCodec::SNAPPY;
-   idx_t row_group_size = RowGroup::ROW_GROUP_SIZE;
+   idx_t row_group_size = Storage::ROW_GROUP_SIZE;

    //! If row_group_size_bytes is not set, we default to row_group_size * BYTES_PER_ROW
    static constexpr const idx_t BYTES_PER_ROW = 1024;
42 changes: 42 additions & 0 deletions scripts/extension-upload-wasm.sh
@@ -0,0 +1,42 @@
#!/bin/bash

# Usage: ./extension-upload-wasm.sh <architecture> <commithash or version_tag>

set -e

echo "$DUCKDB_EXTENSION_SIGNING_PK" > private.pem

FILES="loadable_extensions/*.duckdb_extension.wasm"
for f in $FILES
do
    ext=`basename $f .duckdb_extension.wasm`
    echo $ext
    # copy the extension binary; the signature custom section is appended below
    cat $f > $f.append
    # 0 for custom section
    # 113 in hex = 275 in decimal, total length of what follows (1 + 16 + 2 + 256)
    # [1(continuation) + 0010011(payload) = \x93, 0(continuation) + 10(payload) = \x02]
    echo -n -e '\x00' >> $f.append
    echo -n -e '\x93\x02' >> $f.append
    # 10 in hex = 16 in decimal, length of name, 1 byte
    echo -n -e '\x10' >> $f.append
    # the name of the WebAssembly custom section, 16 bytes
    echo -n -e 'duckdb_signature' >> $f.append
    # 100 in hex = 256 in decimal, the length of the signature payload
    # [1(continuation) + 0000000(payload) = \x80, 0(continuation) + 10(payload) = \x02],
    # for a grand total of 2 bytes
    echo -n -e '\x80\x02' >> $f.append
    # the actual payload, 256 bytes, is added below;
    # calculate the SHA256 hash of everything appended so far
    openssl dgst -binary -sha256 $f.append > $f.hash
    # encrypt hash with extension signing private key to create signature
    openssl pkeyutl -sign -in $f.hash -inkey private.pem -pkeyopt digest:sha256 -out $f.sign
    # append signature to extension binary
    cat $f.sign >> $f.append
    # compress extension binary
    brotli < $f.append > "$f.brotli"
    # upload compressed extension binary to S3
    aws s3 cp $f.brotli s3://test-duckdb-wasm-extensions/duckdb-wasm/$2/$1/$ext.duckdb_extension.wasm --acl public-read --content-encoding br --content-type="application/wasm"
done

rm private.pem
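
The varint comments in this script can be sanity-checked mechanically: WebAssembly encodes section and name lengths as unsigned LEB128. A small self-contained C++ sketch (helper names invented here) reproducing the two encodings appended above:

#include <cstdint>
#include <cstdio>
#include <vector>

// Standard unsigned LEB128: 7 payload bits per byte, high bit = continuation.
static std::vector<uint8_t> EncodeUleb128(uint64_t value) {
    std::vector<uint8_t> out;
    do {
        uint8_t byte = value & 0x7F;
        value >>= 7;
        if (value != 0) {
            byte |= 0x80; // more bytes follow
        }
        out.push_back(byte);
    } while (value != 0);
    return out;
}

static void Print(const char *label, uint64_t value) {
    printf("%s:", label);
    for (auto b : EncodeUleb128(value)) {
        printf(" %02x", b);
    }
    printf("\n");
}

int main() {
    // section length 1 + 16 + 2 + 256 = 275 -> "93 02", as appended above
    Print("275", 275);
    // signature length 256 -> "80 02"
    Print("256", 256);
    return 0;
}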

5 changes: 5 additions & 0 deletions src/common/enum_util.cpp
@@ -551,6 +551,8 @@ BindingMode EnumUtil::FromString<BindingMode>(const char *value) {
template<>
const char* EnumUtil::ToChars<BitpackingMode>(BitpackingMode value) {
    switch(value) {
+   case BitpackingMode::INVALID:
+       return "INVALID";
    case BitpackingMode::AUTO:
        return "AUTO";
    case BitpackingMode::CONSTANT:
@@ -568,6 +570,9 @@ const char* EnumUtil::ToChars<BitpackingMode>(BitpackingMode value) {

template<>
BitpackingMode EnumUtil::FromString<BitpackingMode>(const char *value) {
+   if (StringUtil::Equals(value, "INVALID")) {
+       return BitpackingMode::INVALID;
+   }
    if (StringUtil::Equals(value, "AUTO")) {
        return BitpackingMode::AUTO;
    }
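
Every enum member has to appear in both directions of the mapping, otherwise a value that serializes via ToChars cannot be parsed back by FromString — which is what the INVALID cases added above restore. A self-contained miniature of the same pattern (simplified; the real code lives in EnumUtil and StringUtil):

#include <cstdint>
#include <cstring>
#include <stdexcept>

enum class BitpackingMode : uint8_t { INVALID, AUTO, CONSTANT };

const char *ToChars(BitpackingMode value) {
    switch (value) {
    case BitpackingMode::INVALID:
        return "INVALID";
    case BitpackingMode::AUTO:
        return "AUTO";
    case BitpackingMode::CONSTANT:
        return "CONSTANT";
    }
    throw std::runtime_error("unrecognized BitpackingMode");
}

BitpackingMode FromString(const char *value) {
    if (std::strcmp(value, "INVALID") == 0) {
        return BitpackingMode::INVALID;
    }
    if (std::strcmp(value, "AUTO") == 0) {
        return BitpackingMode::AUTO;
    }
    if (std::strcmp(value, "CONSTANT") == 0) {
        return BitpackingMode::CONSTANT;
    }
    throw std::invalid_argument("unrecognized BitpackingMode string");
}

int main() {
    // round-trip every member, including the newly covered INVALID
    return FromString(ToChars(BitpackingMode::INVALID)) == BitpackingMode::INVALID ? 0 : 1;
}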
2 changes: 1 addition & 1 deletion src/common/file_buffer.cpp
@@ -5,7 +5,7 @@
#include "duckdb/common/exception.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/helper.hpp"

#include "duckdb/storage/storage_info.hpp"
#include <cstring>

namespace duckdb {
56 changes: 56 additions & 0 deletions src/common/types/validity_mask.cpp
@@ -1,4 +1,7 @@
#include "duckdb/common/types/validity_mask.hpp"
#include "duckdb/common/limits.hpp"
#include "duckdb/common/serializer/write_stream.hpp"
#include "duckdb/common/serializer/read_stream.hpp"

namespace duckdb {

@@ -173,4 +176,57 @@ void ValidityMask::SliceInPlace(const ValidityMask &other, idx_t target_offset,
#endif
}

enum class ValiditySerialization : uint8_t { BITMASK = 0, VALID_VALUES = 1, INVALID_VALUES = 2 };

void ValidityMask::Write(WriteStream &writer, idx_t count) {
    auto valid_values = CountValid(count);
    auto invalid_values = count - valid_values;
    auto bitmask_bytes = ValidityMask::ValidityMaskSize(count);
    auto need_u32 = count >= NumericLimits<uint16_t>::Maximum();
    auto bytes_per_value = need_u32 ? sizeof(uint32_t) : sizeof(uint16_t);
    auto valid_value_size = bytes_per_value * valid_values + sizeof(uint32_t);
    auto invalid_value_size = bytes_per_value * invalid_values + sizeof(uint32_t);
    if (valid_value_size < bitmask_bytes || invalid_value_size < bitmask_bytes) {
        auto serialize_valid = valid_value_size < invalid_value_size;
        // serialize (in)valid value indexes as [COUNT][V0][V1][...][VN]
        auto flag = serialize_valid ? ValiditySerialization::VALID_VALUES : ValiditySerialization::INVALID_VALUES;
        writer.Write(flag);
        writer.Write<uint32_t>(MinValue<uint32_t>(valid_values, invalid_values));
        for (idx_t i = 0; i < count; i++) {
            if (RowIsValid(i) == serialize_valid) {
                if (need_u32) {
                    writer.Write<uint32_t>(i);
                } else {
                    writer.Write<uint16_t>(i);
                }
            }
        }
    } else {
        // serialize the entire bitmask
        writer.Write(ValiditySerialization::BITMASK);
        writer.WriteData(const_data_ptr_cast(GetData()), bitmask_bytes);
    }
}

void ValidityMask::Read(ReadStream &reader, idx_t count) {
    Initialize(count);
    // deserialize the storage type
    auto flag = reader.Read<ValiditySerialization>();
    if (flag == ValiditySerialization::BITMASK) {
        // deserialize the bitmask
        reader.ReadData(data_ptr_cast(GetData()), ValidityMask::ValidityMaskSize(count));
        return;
    }
    auto is_u32 = count >= NumericLimits<uint16_t>::Maximum();
    auto is_valid = flag == ValiditySerialization::VALID_VALUES;
    auto serialize_count = reader.Read<uint32_t>();
    if (is_valid) {
        SetAllInvalid(count);
    }
    for (idx_t i = 0; i < serialize_count; i++) {
        idx_t index = is_u32 ? reader.Read<uint32_t>() : reader.Read<uint16_t>();
        Set(index, is_valid);
    }
}

} // namespace duckdb
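
The branch in Write is a size trade-off: the bitmask costs a fixed ~count/8 bytes, while an index list costs a 4-byte count plus 2 or 4 bytes per listed row, so nearly-all-valid or nearly-all-invalid masks get much smaller. A stand-alone sketch of the arithmetic with hypothetical numbers (the plain byte rounding below approximates ValidityMaskSize, which may align upward):

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    // 1000 rows, 3 of them NULL: hypothetical numbers for illustration
    uint64_t count = 1000;
    uint64_t valid_values = 997;
    uint64_t invalid_values = count - valid_values;
    // plain bit-per-row rounding; the real ValidityMaskSize may align upward
    uint64_t bitmask_bytes = (count + 7) / 8; // 125
    // row indexes fit in uint16_t until count reaches 65535
    uint64_t bytes_per_value = count >= 65535 ? sizeof(uint32_t) : sizeof(uint16_t);
    uint64_t valid_size = bytes_per_value * valid_values + sizeof(uint32_t);     // 1998
    uint64_t invalid_size = bytes_per_value * invalid_values + sizeof(uint32_t); // 10
    if (valid_size < bitmask_bytes || invalid_size < bitmask_bytes) {
        // 10 < 125: three uint16 indexes plus a count beat the bitmask
        printf("index list: %llu bytes (bitmask: %llu)\n",
               (unsigned long long)std::min(valid_size, invalid_size),
               (unsigned long long)bitmask_bytes);
    } else {
        printf("bitmask: %llu bytes\n", (unsigned long long)bitmask_bytes);
    }
    return 0;
}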
src/execution/operator/persistent/physical_batch_insert.cpp
@@ -194,7 +194,7 @@ class BatchInsertGlobalState : public GlobalSinkState {
        }
        auto new_count = current_collection->GetTotalRows();
        auto batch_type =
-           new_count < RowGroup::ROW_GROUP_SIZE ? RowGroupBatchType::NOT_FLUSHED : RowGroupBatchType::FLUSHED;
+           new_count < Storage::ROW_GROUP_SIZE ? RowGroupBatchType::NOT_FLUSHED : RowGroupBatchType::FLUSHED;
        if (batch_type == RowGroupBatchType::FLUSHED && writer) {
            writer->WriteLastRowGroup(*current_collection);
        }
2 changes: 1 addition & 1 deletion src/execution/operator/persistent/physical_insert.cpp
@@ -482,7 +482,7 @@ SinkCombineResultType PhysicalInsert::Combine(ExecutionContext &context, Operato

    lock_guard<mutex> lock(gstate.lock);
    gstate.insert_count += append_count;
-   if (append_count < RowGroup::ROW_GROUP_SIZE) {
+   if (append_count < Storage::ROW_GROUP_SIZE) {
        // we have few rows - append to the local storage directly
        auto &table = gstate.table;
        auto &storage = table.GetStorage();
10 changes: 9 additions & 1 deletion src/function/table/arrow_conversion.cpp
@@ -837,7 +837,15 @@ void ArrowTableFunction::ArrowToDuckDB(ArrowScanLocalState &scan_state, const ar
        throw InvalidInputException("arrow_scan: array length mismatch");
    }
    // Make sure this Vector keeps the Arrow chunk alive in case we can zero-copy the data
-   output.data[idx].GetBuffer()->SetAuxiliaryData(make_uniq<ArrowAuxiliaryData>(scan_state.chunk));
+   if (scan_state.arrow_owned_data.find(idx) == scan_state.arrow_owned_data.end()) {
+       auto arrow_data = make_shared<ArrowArrayWrapper>();
+       arrow_data->arrow_array = scan_state.chunk->arrow_array;
+       scan_state.chunk->arrow_array.release = nullptr;
+       scan_state.arrow_owned_data[idx] = arrow_data;
+   }
+
+   output.data[idx].GetBuffer()->SetAuxiliaryData(make_uniq<ArrowAuxiliaryData>(scan_state.arrow_owned_data[idx]));

    D_ASSERT(arrow_convert_data.find(col_idx) != arrow_convert_data.end());
    auto &arrow_type = *arrow_convert_data.at(col_idx);
    if (array.dictionary) {
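
Setting `release = nullptr` after copying the struct is the Arrow C data interface's ownership-transfer convention: the copy now owns the buffers, and the producer side is marked released so nothing double-frees. A trimmed-down sketch of the idiom (the struct below keeps only the fields the sketch touches; the real ArrowArray also carries length, buffers, and children):

// Reduced ArrowArray: fields limited to what this sketch needs.
struct ArrowArray {
    void (*release)(ArrowArray *);
    void *private_data;
};

static void ReleaseNoop(ArrowArray *array) {
    // a real producer would free its buffers here
    array->release = nullptr;
}

// The move idiom from the diff: shallow-copy the struct, then null the
// source's release callback so only the copy may free the data.
static ArrowArray TakeOwnership(ArrowArray &source) {
    ArrowArray owned = source;
    source.release = nullptr; // source is now "released" per the spec
    return owned;
}

int main() {
    ArrowArray source {ReleaseNoop, nullptr};
    ArrowArray owned = TakeOwnership(source);
    // source.release == nullptr: a double free is no longer possible
    if (owned.release) {
        owned.release(&owned);
    }
    return 0;
}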