Skip to content

Commit

Permalink
sample across multiple files
Browse files Browse the repository at this point in the history
  • Loading branch information
lnkuiper committed Sep 12, 2023
1 parent 312b995 commit 10e4204
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 8 deletions.
7 changes: 5 additions & 2 deletions extension/json/json_functions/read_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
Vector string_vector(LogicalType::VARCHAR);

// Loop through the files (if union_by_name, else just sample the first file)
idx_t remaining = bind_data.sample_size;
for (idx_t file_idx = 0; file_idx < bind_data.files.size(); file_idx++) {
// Create global/local state and place the reader in the right field
JSONScanGlobalState gstate(context, bind_data);
Expand All @@ -28,7 +29,6 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
}

// Read and detect schema
idx_t remaining = bind_data.sample_size;
while (remaining != 0) {
allocator.Reset();
auto read_count = lstate.ReadNext(gstate);
Expand Down Expand Up @@ -56,7 +56,10 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
}

// Close the file and stop detection if not union_by_name
if (!bind_data.options.file_options.union_by_name) {
if (bind_data.options.file_options.union_by_name) {
// When union_by_name=true we sample sample_size per file
remaining = bind_data.sample_size;
} else if (remaining == 0) {
break;
}
}
Expand Down
10 changes: 4 additions & 6 deletions extension/json/json_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

#include "duckdb/common/enum_util.hpp"
#include "duckdb/common/multi_file_reader.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/main/extension_helper.hpp"
#include "duckdb/parallel/task_scheduler.hpp"
#include "duckdb/storage/buffer_manager.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/serializer/deserializer.hpp"

namespace duckdb {

Expand Down Expand Up @@ -558,10 +558,8 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
if (current_reader) {
// If we performed the final read of this reader in the previous iteration, close it now
if (is_last) {
if (gstate.bind_data.type != JSONScanType::SAMPLE) {
TryIncrementFileIndex(gstate);
current_reader->CloseJSONFile();
}
TryIncrementFileIndex(gstate);
current_reader->CloseJSONFile();
current_reader = nullptr;
continue;
}
Expand Down
20 changes: 20 additions & 0 deletions test/sql/json/table/json_multi_file_reader.test
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,23 @@ group by j
order by j;
----
[3] 5

# the JSON multi-file reader is a bit different, because we always sample sample_size
# even across multiple files when union_by_name=false
# these two files have a different schema, but we can read them together nonetheless
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'])

# both have 5 rows, so if we set sample_size=1, we cannot read them together anymore, because we only sample rows from the first file
statement error
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=1)
----
Invalid Input Error

# if we set union_by_name=true, then we sample sample_size rows per file, so then we can read them again
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=1, union_by_name=true)

# with sample size 6 we sample 1 line from the second file, and of course we can read it again
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=6)

0 comments on commit 10e4204

Please sign in to comment.