Skip to content

Commit

Permalink
sample across multiple files
Browse files Browse the repository at this point in the history
  • Loading branch information
lnkuiper committed Sep 12, 2023
1 parent 312b995 commit 10e4204
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 8 deletions.
7 changes: 5 additions & 2 deletions extension/json/json_functions/read_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
Vector string_vector(LogicalType::VARCHAR);

// Loop through the files (if union_by_name, else just sample the first file)
idx_t remaining = bind_data.sample_size;
for (idx_t file_idx = 0; file_idx < bind_data.files.size(); file_idx++) {
// Create global/local state and place the reader in the right field
JSONScanGlobalState gstate(context, bind_data);
Expand All @@ -28,7 +29,6 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
}

// Read and detect schema
idx_t remaining = bind_data.sample_size;
while (remaining != 0) {
allocator.Reset();
auto read_count = lstate.ReadNext(gstate);
Expand Down Expand Up @@ -56,7 +56,10 @@ void JSONScan::AutoDetect(ClientContext &context, JSONScanData &bind_data, vecto
}

// Close the file and stop detection if not union_by_name
if (!bind_data.options.file_options.union_by_name) {
if (bind_data.options.file_options.union_by_name) {
// When union_by_name=true we sample sample_size per file
remaining = bind_data.sample_size;
} else if (remaining == 0) {
break;
}
}
Expand Down
10 changes: 4 additions & 6 deletions extension/json/json_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@

#include "duckdb/common/enum_util.hpp"
#include "duckdb/common/multi_file_reader.hpp"
#include "duckdb/common/serializer/deserializer.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/main/extension_helper.hpp"
#include "duckdb/parallel/task_scheduler.hpp"
#include "duckdb/storage/buffer_manager.hpp"
#include "duckdb/common/serializer/serializer.hpp"
#include "duckdb/common/serializer/deserializer.hpp"

namespace duckdb {

Expand Down Expand Up @@ -558,10 +558,8 @@ bool JSONScanLocalState::ReadNextBuffer(JSONScanGlobalState &gstate) {
if (current_reader) {
// If we performed the final read of this reader in the previous iteration, close it now
if (is_last) {
if (gstate.bind_data.type != JSONScanType::SAMPLE) {
TryIncrementFileIndex(gstate);
current_reader->CloseJSONFile();
}
TryIncrementFileIndex(gstate);
current_reader->CloseJSONFile();
current_reader = nullptr;
continue;
}
Expand Down
20 changes: 20 additions & 0 deletions test/sql/json/table/json_multi_file_reader.test
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,23 @@ group by j
order by j;
----
[3] 5

# the JSON multi-file reader is a bit different, because we always sample sample_size
# even across multiple files when union_by_name=false
# these two files have a different schema, but we can read them together nonetheless
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'])

# both have 5 rows, so if we set sample_size=1, we cannot read them together anymore, because we only sample rows from the first file
statement error
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=1)
----
Invalid Input Error

# if we set union_by_name=true, then we sample sample_size rows per file, so then we can read them again
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=1, union_by_name=true)

# with sample size 6 we sample 1 line from the second file, and of course we can read it again
statement ok
SELECT * FROM read_json_auto(['data/json/with_uuid.json', 'data/json/example_n.ndjson'], sample_size=6)

0 comments on commit 10e4204

Please sign in to comment.