From 0357a33c26a60d20923ce596dd98bcd04f9a03e1 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 21 Feb 2024 11:05:05 -0300 Subject: [PATCH 001/147] Get rid of rejects_recovery_columns --- .../table_function/csv_file_scanner.cpp | 14 ------ .../table_function/global_csv_state.cpp | 15 ------- .../csv_scanner/util/csv_reader_options.cpp | 7 --- .../operator/persistent/csv_rejects_table.cpp | 9 ---- src/function/table/read_csv.cpp | 23 +--------- .../csv_scanner/csv_reader_options.hpp | 4 -- .../duckdb/storage/serialization/nodes.json | 26 ++++------- src/storage/serialization/serialize_nodes.cpp | 44 +++++++++---------- 8 files changed, 30 insertions(+), 112 deletions(-) diff --git a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp index 27293e0736cb..7bdc4712fc02 100644 --- a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +++ b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp @@ -168,20 +168,6 @@ void CSVFileScan::InitializeFileNamesTypes() { projection_ids.emplace_back(result_idx, i); } - if (!projected_columns.empty()) { - // We might have to add recovery rejects column ids - for (idx_t i = 0; i < options.rejects_recovery_column_ids.size(); i++) { - idx_t col_id = options.rejects_recovery_column_ids[i]; - if (projected_columns.find(col_id) == projected_columns.end()) { - // We have to insert this column in our projection - projected_columns.insert(col_id); - file_types.emplace_back(LogicalType::VARCHAR); - projected_columns.insert(col_id); - projection_ids.emplace_back(col_id, col_id); - } - } - } - if (reader_data.column_ids.empty()) { file_types = types; } diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 764c7b057053..2f8e92e3718f 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -174,21 +174,6 @@ void CSVGlobalState::FillRejectsTable() { appender.Append(string_t("\"" + col_name + "\"")); appender.Append(error.row[col_idx]); - if (!options.rejects_recovery_columns.empty()) { - child_list_t recovery_key; - for (auto &key_idx : options.rejects_recovery_column_ids) { - // Figure out if the recovery key is valid. - // If not, error out for real. - auto &value = error.row[key_idx]; - if (value.IsNull()) { - throw InvalidInputException("%s at line %llu in column %s. 
Parser options:\n%s ", - "Could not parse recovery column", row_line, col_name, - options.ToString()); - } - recovery_key.emplace_back(bind_data.return_names[key_idx], value); - } - appender.Append(Value::STRUCT(recovery_key)); - } auto row_error_msg = StringUtil::Format("Could not convert string '%s' to '%s'", error.row[col_idx].ToString(), file->types[col_idx].ToString()); diff --git a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp index d05db0bb8fb2..72c73a2e5bac 100644 --- a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +++ b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp @@ -213,13 +213,6 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, throw BinderException("REJECTS_TABLE option cannot be empty"); } rejects_table_name = table_name; - } else if (loption == "rejects_recovery_columns") { - // Get the list of columns to use as a recovery key - auto &children = ListValue::GetChildren(value); - for (auto &child : children) { - auto col_name = child.GetValue(); - rejects_recovery_columns.push_back(col_name); - } } else if (loption == "rejects_limit") { int64_t limit = ParseInteger(value, loption); if (limit < 0) { diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index a96bb2251aa4..7d01723a7718 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -29,15 +29,6 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData info->columns.AddColumn(ColumnDefinition("column", LogicalType::BIGINT)); info->columns.AddColumn(ColumnDefinition("column_name", LogicalType::VARCHAR)); info->columns.AddColumn(ColumnDefinition("parsed_value", LogicalType::VARCHAR)); - - if (!data.options.rejects_recovery_columns.empty()) { - child_list_t recovery_key_components; - for (auto &col_name : data.options.rejects_recovery_columns) { - recovery_key_components.emplace_back(col_name, LogicalType::VARCHAR); - } - info->columns.AddColumn(ColumnDefinition("recovery_columns", LogicalType::STRUCT(recovery_key_components))); - } - info->columns.AddColumn(ColumnDefinition("error", LogicalType::VARCHAR)); catalog.CreateTable(context, std::move(info)); diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index e7c3edb27276..1b8c2ffe7478 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -41,22 +41,6 @@ ReadCSVData::ReadCSVData() { void ReadCSVData::FinalizeRead(ClientContext &context) { BaseCSVData::Finalize(); - if (!options.rejects_recovery_columns.empty()) { - for (auto &recovery_col : options.rejects_recovery_columns) { - bool found = false; - for (idx_t col_idx = 0; col_idx < return_names.size(); col_idx++) { - if (StringUtil::CIEquals(return_names[col_idx], recovery_col)) { - options.rejects_recovery_column_ids.push_back(col_idx); - found = true; - break; - } - } - if (!found) { - throw BinderException("Unsupported parameter for REJECTS_RECOVERY_COLUMNS: column \"%s\" not found", - recovery_col); - } - } - } } static unique_ptr ReadCSVBind(ClientContext &context, TableFunctionBindInput &input, @@ -84,11 +68,6 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio } } - if (!options.rejects_recovery_columns.empty() && options.rejects_table_name.empty()) { - throw BinderException( - "REJECTS_RECOVERY_COLUMNS option is only 
supported when REJECTS_TABLE is set to a table name"); - } - options.file_options.AutoDetectHivePartitioning(result->files, context); if (!options.auto_detect && return_types.empty()) { @@ -143,7 +122,7 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio result->return_types = return_types; result->return_names = names; - result->FinalizeRead(context); + result->Finalize(); return std::move(result); } diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp index 6aa349bf823e..ee06436ed9d6 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp @@ -45,10 +45,6 @@ struct CSVReaderOptions { string rejects_table_name; //! Rejects table entry limit (0 = no limit) idx_t rejects_limit = 0; - //! Columns to use as recovery key for rejected rows when reading with ignore_errors = true - vector rejects_recovery_columns; - //! Index of the recovery columns - vector rejects_recovery_column_ids; //! Number of samples to buffer idx_t buffer_sample_size = (idx_t)STANDARD_VECTOR_SIZE * 50; //! Specifies the string that represents a null value diff --git a/src/include/duckdb/storage/serialization/nodes.json b/src/include/duckdb/storage/serialization/nodes.json index 97fda8405fdc..fadbc480cb22 100644 --- a/src/include/duckdb/storage/serialization/nodes.json +++ b/src/include/duckdb/storage/serialization/nodes.json @@ -584,50 +584,42 @@ "type": "idx_t" }, {"id": 119, - "name": "rejects_recovery_columns", - "type": "vector" - }, - {"id": 120, - "name": "rejects_recovery_column_ids", - "type": "vector" - }, - {"id": 121, "name": "dialect_options.state_machine_options.delimiter", "type": "CSVOption" }, - {"id": 122, + {"id": 120, "name": "dialect_options.state_machine_options.quote", "type": "CSVOption" }, - {"id": 123, + {"id": 121, "name": "dialect_options.state_machine_options.escape", "type": "CSVOption" }, - {"id": 124, + {"id": 122, "name": "dialect_options.header", "type": "CSVOption" }, - {"id": 125, + {"id": 123, "name": "dialect_options.num_cols", "type": "idx_t" }, - {"id": 126, + {"id": 124, "name": "dialect_options.state_machine_options.new_line", "type": "CSVOption" }, - {"id": 127, + {"id": 125, "name": "dialect_options.skip_rows", "type": "CSVOption" }, - {"id": 128, + {"id": 126, "name": "dialect_options.date_format", "type": "map>" }, - {"id": 129, + {"id": 127, "name": "sniffer_user_mismatch_error", "type": "string" }, - {"id": 130, + {"id": 128, "name": "parallel", "type": "bool" } diff --git a/src/storage/serialization/serialize_nodes.cpp b/src/storage/serialization/serialize_nodes.cpp index 13df905c94f4..714a465f4b92 100644 --- a/src/storage/serialization/serialize_nodes.cpp +++ b/src/storage/serialization/serialize_nodes.cpp @@ -121,18 +121,16 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault>(116, "force_quote", force_quote); serializer.WritePropertyWithDefault(117, "rejects_table_name", rejects_table_name); serializer.WritePropertyWithDefault(118, "rejects_limit", rejects_limit); - serializer.WritePropertyWithDefault>(119, "rejects_recovery_columns", rejects_recovery_columns); - serializer.WritePropertyWithDefault>(120, "rejects_recovery_column_ids", rejects_recovery_column_ids); - serializer.WriteProperty>(121, "dialect_options.state_machine_options.delimiter", 
dialect_options.state_machine_options.delimiter); - serializer.WriteProperty>(122, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote); - serializer.WriteProperty>(123, "dialect_options.state_machine_options.escape", dialect_options.state_machine_options.escape); - serializer.WriteProperty>(124, "dialect_options.header", dialect_options.header); - serializer.WritePropertyWithDefault(125, "dialect_options.num_cols", dialect_options.num_cols); - serializer.WriteProperty>(126, "dialect_options.state_machine_options.new_line", dialect_options.state_machine_options.new_line); - serializer.WriteProperty>(127, "dialect_options.skip_rows", dialect_options.skip_rows); - serializer.WriteProperty>>(128, "dialect_options.date_format", dialect_options.date_format); - serializer.WritePropertyWithDefault(129, "sniffer_user_mismatch_error", sniffer_user_mismatch_error); - serializer.WritePropertyWithDefault(130, "parallel", parallel); + serializer.WriteProperty>(119, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter); + serializer.WriteProperty>(120, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote); + serializer.WriteProperty>(121, "dialect_options.state_machine_options.escape", dialect_options.state_machine_options.escape); + serializer.WriteProperty>(122, "dialect_options.header", dialect_options.header); + serializer.WritePropertyWithDefault(123, "dialect_options.num_cols", dialect_options.num_cols); + serializer.WriteProperty>(124, "dialect_options.state_machine_options.new_line", dialect_options.state_machine_options.new_line); + serializer.WriteProperty>(125, "dialect_options.skip_rows", dialect_options.skip_rows); + serializer.WriteProperty>>(126, "dialect_options.date_format", dialect_options.date_format); + serializer.WritePropertyWithDefault(127, "sniffer_user_mismatch_error", sniffer_user_mismatch_error); + serializer.WritePropertyWithDefault(128, "parallel", parallel); } CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { @@ -156,18 +154,16 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { deserializer.ReadPropertyWithDefault>(116, "force_quote", result.force_quote); deserializer.ReadPropertyWithDefault(117, "rejects_table_name", result.rejects_table_name); deserializer.ReadPropertyWithDefault(118, "rejects_limit", result.rejects_limit); - deserializer.ReadPropertyWithDefault>(119, "rejects_recovery_columns", result.rejects_recovery_columns); - deserializer.ReadPropertyWithDefault>(120, "rejects_recovery_column_ids", result.rejects_recovery_column_ids); - deserializer.ReadProperty>(121, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter); - deserializer.ReadProperty>(122, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote); - deserializer.ReadProperty>(123, "dialect_options.state_machine_options.escape", result.dialect_options.state_machine_options.escape); - deserializer.ReadProperty>(124, "dialect_options.header", result.dialect_options.header); - deserializer.ReadPropertyWithDefault(125, "dialect_options.num_cols", result.dialect_options.num_cols); - deserializer.ReadProperty>(126, "dialect_options.state_machine_options.new_line", result.dialect_options.state_machine_options.new_line); - deserializer.ReadProperty>(127, "dialect_options.skip_rows", result.dialect_options.skip_rows); - 
deserializer.ReadProperty>>(128, "dialect_options.date_format", result.dialect_options.date_format); - deserializer.ReadPropertyWithDefault(129, "sniffer_user_mismatch_error", result.sniffer_user_mismatch_error); - deserializer.ReadPropertyWithDefault(130, "parallel", result.parallel); + deserializer.ReadProperty>(119, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter); + deserializer.ReadProperty>(120, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote); + deserializer.ReadProperty>(121, "dialect_options.state_machine_options.escape", result.dialect_options.state_machine_options.escape); + deserializer.ReadProperty>(122, "dialect_options.header", result.dialect_options.header); + deserializer.ReadPropertyWithDefault(123, "dialect_options.num_cols", result.dialect_options.num_cols); + deserializer.ReadProperty>(124, "dialect_options.state_machine_options.new_line", result.dialect_options.state_machine_options.new_line); + deserializer.ReadProperty>(125, "dialect_options.skip_rows", result.dialect_options.skip_rows); + deserializer.ReadProperty>>(126, "dialect_options.date_format", result.dialect_options.date_format); + deserializer.ReadPropertyWithDefault(127, "sniffer_user_mismatch_error", result.sniffer_user_mismatch_error); + deserializer.ReadPropertyWithDefault(128, "parallel", result.parallel); return result; } From 62d8dec545bc22e4dda08fb980b83ae1563be082 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 21 Feb 2024 11:06:39 -0300 Subject: [PATCH 002/147] pesky bee --- src/function/table/read_csv.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 1b8c2ffe7478..272c5f95a6b0 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -230,7 +230,6 @@ void ReadCSVTableFunction::ReadCSVAddNamedParameters(TableFunction &table_functi table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN; table_function.named_parameters["rejects_table"] = LogicalType::VARCHAR; table_function.named_parameters["rejects_limit"] = LogicalType::BIGINT; - table_function.named_parameters["rejects_recovery_columns"] = LogicalType::LIST(LogicalType::VARCHAR); table_function.named_parameters["buffer_size"] = LogicalType::UBIGINT; table_function.named_parameters["decimal_separator"] = LogicalType::VARCHAR; table_function.named_parameters["parallel"] = LogicalType::BOOLEAN; From e7bfcd62104696b2e40a44346c007207646e9af2 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 22 Feb 2024 10:13:07 -0300 Subject: [PATCH 003/147] wip commit --- .../scanner/string_value_scanner.cpp | 40 ++++++++++++------- .../operator/csv_scanner/util/csv_error.cpp | 9 ++--- .../operator/csv_scanner/csv_error.hpp | 8 ++-- .../csv_scanner/string_value_scanner.hpp | 2 + 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 349168d74949..c43ee62d1d6e 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -322,6 +322,24 @@ void StringValueResult::NullPaddingQuotedNewlineCheck() { } } +//! 
Reconstructs the current line to be used in error messages +string StringValueResult::ReconstructCurrentLine(){ + LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, + buffer_size}; + idx_t current_line_size = current_line_start - previous_line_start; + string result; + result.resize(current_line_size); + if (iterator.pos.buffer_idx == previous_line_start.buffer_idx){ + idx_t result_idx = 0; + for (idx_t i = previous_line_start.buffer_pos; i < iterator.pos.buffer_pos; i ++){ + result[result_idx++] = buffer_ptr[i]; + } + } else{ + throw InternalException("Oh no"); + } + return result; +} + bool StringValueResult::AddRowInternal() { if (ignore_current_row) { // An error occurred on this row, we are ignoring it and resetting our control flag @@ -330,17 +348,6 @@ bool StringValueResult::AddRowInternal() { } if (!cast_errors.empty()) { // A wild casting error appears - // Recreate row for rejects-table - vector row; - if (!state_machine.options.rejects_table_name.empty()) { - for (idx_t col = 0; col < parse_chunk.ColumnCount(); col++) { - if (cast_errors.find(col) != cast_errors.end()) { - row.push_back(cast_errors[col]); - } else { - row.push_back(parse_chunk.data[col].GetValue(number_of_rows)); - } - } - } for (auto &cast_error : cast_errors) { std::ostringstream error; // Casting Error Message @@ -348,9 +355,9 @@ bool StringValueResult::AddRowInternal() { << LogicalTypeIdToString(parse_types[cast_error.first]) << "\'"; auto error_string = error.str(); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - 1); - + auto borked_line = ReconstructCurrentLine(); auto csv_error = CSVError::CastError(state_machine.options, names[cast_error.first], error_string, - cast_error.first, row, lines_per_batch); + cast_error.first, borked_line, lines_per_batch); error_handler.Error(csv_error); } // If we got here it means we are ignoring errors, hence we need to signify to our result scanner to ignore this @@ -615,8 +622,10 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { } LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - parse_chunk.size() + line_error); +// auto borked_line = result.ReconstructCurrentLine(); + string empty; auto csv_error = CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], - error_message, col_idx, row, lines_per_batch); + error_message, col_idx, empty, lines_per_batch); error_handler->Error(csv_error); } borked_lines.insert(line_error++); @@ -632,8 +641,9 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { } LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - parse_chunk.size() + line_error); + string empty; auto csv_error = CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], - error_message, col_idx, row, lines_per_batch); + error_message, col_idx, empty, lines_per_batch); error_handler->Error(csv_error); } diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index 7867421d57d1..c96893ed4cc3 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -76,9 +76,9 @@ CSVError::CSVError(string error_message_p, CSVErrorType type_p, LinesPerBoundary : error_message(std::move(error_message_p)), type(type_p), error_info(error_info_p) { } -CSVError::CSVError(string error_message_p, CSVErrorType type_p, idx_t column_idx_p, vector row_p, +CSVError::CSVError(string error_message_p, CSVErrorType 
type_p, idx_t column_idx_p, string csv_row_p, LinesPerBoundary error_info_p) - : error_message(std::move(error_message_p)), type(type_p), column_idx(column_idx_p), row(std::move(row_p)), + : error_message(std::move(error_message_p)), type(type_p), column_idx(column_idx_p), csv_row(std::move(csv_row_p)), error_info(error_info_p) { } @@ -102,8 +102,7 @@ CSVError CSVError::ColumnTypesError(case_insensitive_map_t sql_types_per_ return CSVError(exception, CSVErrorType::COLUMN_NAME_TYPE_MISMATCH, {}); } -CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, idx_t column_idx, - vector &row, LinesPerBoundary error_info) { +CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, idx_t column_idx, string &csv_row, LinesPerBoundary error_info) { std::ostringstream error; // Which column error << "Error when converting column \"" << column_name << "\"." << std::endl; @@ -112,7 +111,7 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_nam error << std::endl; // What were the options error << options.ToString(); - return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, row, error_info); + return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info); } CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info) { diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp index 4d5eeada36eb..c40045b74bc5 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp @@ -50,13 +50,13 @@ enum CSVErrorType : uint8_t { class CSVError { public: CSVError() {}; - CSVError(string error_message, CSVErrorType type, idx_t column_idx, vector row, LinesPerBoundary error_info); + CSVError(string error_message, CSVErrorType type, idx_t column_idx, string csv_row, LinesPerBoundary error_info); CSVError(string error_message, CSVErrorType type, LinesPerBoundary error_info); //! Produces error messages for column name -> type mismatch. static CSVError ColumnTypesError(case_insensitive_map_t sql_types_per_column, const vector &names); //! Produces error messages for casting errors static CSVError CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, - idx_t column_idx, vector &row, LinesPerBoundary error_info); + idx_t column_idx, string &csv_row, LinesPerBoundary error_info); //! Produces error for when the line size exceeds the maximum line size option static CSVError LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info); //! Produces error for when the sniffer couldn't find viable options @@ -80,8 +80,8 @@ class CSVError { CSVErrorType type; //! Column Index where error happened idx_t column_idx; - //! Values from the row where error happened - vector row; + //! Original CSV row where error happened + string csv_row; //! 
Line information regarding this error LinesPerBoundary error_info; }; diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 2b1475f92861..4750da6b65db 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -117,6 +117,8 @@ class StringValueResult : public ScannerResult { //! Handles EmptyLine states static inline bool EmptyLine(StringValueResult &result, const idx_t buffer_pos); inline bool AddRowInternal(); + //! Reconstructs the current line to be used in error messages + string ReconstructCurrentLine(); void HandleOverLimitRows(); From bf320b4c87b52955b566a3864d0a3c5b84a94c37 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Fri, 23 Feb 2024 17:45:08 -0300 Subject: [PATCH 004/147] wip --- .../operator/csv_scanner/table_function/global_csv_state.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 2f8e92e3718f..4bd982f4571f 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -168,10 +168,15 @@ void CSVGlobalState::FillRejectsTable() { auto col_name = bind_data.return_names[col_idx]; // Add the row to the rejects table appender.BeginRow(); + // 1. File Name appender.Append(string_t(file_name)); + // 2. Row Line appender.Append(row_line); + // 3. Column Index appender.Append(col_idx); + // 4. Column Name appender.Append(string_t("\"" + col_name + "\"")); + // 5 Parsed Value appender.Append(error.row[col_idx]); auto row_error_msg = From ba93182badfe9c0a608d175308898803cef090cf Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 28 Feb 2024 09:16:54 -0300 Subject: [PATCH 005/147] Enum for CSV Errors, cleaning up output --- .../scanner/string_value_scanner.cpp | 13 ++++----- .../table_function/global_csv_state.cpp | 18 +++++------- .../operator/csv_scanner/util/csv_error.cpp | 3 +- .../operator/persistent/csv_rejects_table.cpp | 29 +++++++++++++++++-- 4 files changed, 42 insertions(+), 21 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 3569bfab62ec..d5a295190706 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -323,18 +323,17 @@ void StringValueResult::NullPaddingQuotedNewlineCheck() { } //! 
Reconstructs the current line to be used in error messages -string StringValueResult::ReconstructCurrentLine(){ - LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, - buffer_size}; +string StringValueResult::ReconstructCurrentLine() { + LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, buffer_size}; idx_t current_line_size = current_line_start - previous_line_start; string result; result.resize(current_line_size); - if (iterator.pos.buffer_idx == previous_line_start.buffer_idx){ + if (iterator.pos.buffer_idx == previous_line_start.buffer_idx) { idx_t result_idx = 0; - for (idx_t i = previous_line_start.buffer_pos; i < iterator.pos.buffer_pos; i ++){ + for (idx_t i = previous_line_start.buffer_pos; i < iterator.pos.buffer_pos; i++) { result[result_idx++] = buffer_ptr[i]; } - } else{ + } else { throw InternalException("Oh no"); } return result; @@ -622,7 +621,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { } LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - parse_chunk.size() + line_error); -// auto borked_line = result.ReconstructCurrentLine(); + // auto borked_line = result.ReconstructCurrentLine(); string empty; auto csv_error = CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], error_message, col_idx, empty, lines_per_batch); diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index f8b8000e9843..7e3a24d6f77c 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -154,7 +154,7 @@ void CSVGlobalState::FillRejectsTable() { for (auto &error_vector : errors) { for (auto &error : error_vector.second) { if (error.type != CSVErrorType::CAST_ERROR) { - // For now we only will use it for casting errors + // For now, we only will use it for casting errors continue; } // short circuit if we already have too many rejects @@ -168,21 +168,17 @@ void CSVGlobalState::FillRejectsTable() { auto col_name = bind_data.return_names[col_idx]; // Add the row to the rejects table appender.BeginRow(); - // 1. File Name + // 1. File Path appender.Append(string_t(file_name)); // 2. Row Line appender.Append(row_line); - // 3. Column Index + // 3. Column Index (If Applicable) appender.Append(col_idx); - // 4. Column Name + // 4. Column Name (If Applicable) appender.Append(string_t("\"" + col_name + "\"")); - // 5 Parsed Value - appender.Append(error.row[col_idx]); - - auto row_error_msg = - StringUtil::Format("Could not convert string '%s' to '%s'", error.row[col_idx].ToString(), - file->types[col_idx].ToString()); - appender.Append(string_t(row_error_msg)); + // 5. Error Type (ENUM?) + // 6. Full Error Message + // 7. 
Original CSV Line appender.EndRow(); } appender.Close(); diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index c96893ed4cc3..4bd08a7d7a8d 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -102,7 +102,8 @@ CSVError CSVError::ColumnTypesError(case_insensitive_map_t sql_types_per_ return CSVError(exception, CSVErrorType::COLUMN_NAME_TYPE_MISMATCH, {}); } -CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, idx_t column_idx, string &csv_row, LinesPerBoundary error_info) { +CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, idx_t column_idx, + string &csv_row, LinesPerBoundary error_info) { std::ostringstream error; // Which column error << "Error when converting column \"" << column_name << "\"." << std::endl; diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 7d01723a7718..2e64d637e0ec 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -3,6 +3,7 @@ #include "duckdb/function/table/read_csv.hpp" #include "duckdb/execution/operator/persistent/csv_rejects_table.hpp" #include "duckdb/catalog/catalog_entry/table_catalog_entry.hpp" +#include "duckdb/parser/parsed_data/create_type_info.hpp" namespace duckdb { @@ -21,15 +22,39 @@ shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData &data) { // (Re)Create the temporary rejects table auto &catalog = Catalog::GetCatalog(context, TEMP_CATALOG); + + // Create CSV_ERROR_TYPE ENUM + string enum_name = "CSV_ERROR_TYPE"; + Vector order_errors(LogicalType::VARCHAR, 5); + order_errors.SetValue(0, "CAST"); + order_errors.SetValue(0, "MISSING COLUMNS"); + order_errors.SetValue(0, "TOO MANY COLUMNS"); + order_errors.SetValue(0, "UNQUOTED VALUE"); + order_errors.SetValue(0, "LINE SIZE OVER MAXIMUM"); + LogicalType enum_type = LogicalType::ENUM(enum_name, order_errors, 5); + auto type_info = make_uniq(enum_name, enum_type); + type_info->temporary = true; + type_info->on_conflict = OnCreateConflict::IGNORE_ON_CONFLICT; + catalog.CreateType(context, *type_info); + + // Create Rejects Table auto info = make_uniq(TEMP_CATALOG, DEFAULT_SCHEMA, name); info->temporary = true; info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT; + // 1. File Path info->columns.AddColumn(ColumnDefinition("file", LogicalType::VARCHAR)); + // 2. Row Line info->columns.AddColumn(ColumnDefinition("line", LogicalType::BIGINT)); + // 3. Column Index (If Applicable) info->columns.AddColumn(ColumnDefinition("column", LogicalType::BIGINT)); + // 4. Column Name (If Applicable) info->columns.AddColumn(ColumnDefinition("column_name", LogicalType::VARCHAR)); - info->columns.AddColumn(ColumnDefinition("parsed_value", LogicalType::VARCHAR)); - info->columns.AddColumn(ColumnDefinition("error", LogicalType::VARCHAR)); + // 5. Error Type + info->columns.AddColumn(ColumnDefinition("error_type", enum_type)); + // 6. Full Error Message + info->columns.AddColumn(ColumnDefinition("error_message", LogicalType::VARCHAR)); + // 7. 
Original CSV Line + info->columns.AddColumn(ColumnDefinition("csv_line", LogicalType::VARCHAR)); catalog.CreateTable(context, std::move(info)); From 551865f39d953c3de33e6ea0781940f5896d7e08 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 28 Feb 2024 10:11:18 -0300 Subject: [PATCH 006/147] Several tweaks for the tables --- .../scanner/string_value_scanner.cpp | 9 ++++----- .../table_function/global_csv_state.cpp | 8 +++++--- .../operator/persistent/csv_rejects_table.cpp | 18 ++++++++---------- 3 files changed, 17 insertions(+), 18 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index d5a295190706..1002d9e6b164 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -324,13 +324,12 @@ void StringValueResult::NullPaddingQuotedNewlineCheck() { //! Reconstructs the current line to be used in error messages string StringValueResult::ReconstructCurrentLine() { - LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, buffer_size}; - idx_t current_line_size = current_line_start - previous_line_start; + idx_t current_line_size = previous_line_start - pre_previous_line_start; string result; - result.resize(current_line_size); - if (iterator.pos.buffer_idx == previous_line_start.buffer_idx) { + result.resize(current_line_size - 1); + if (previous_line_start.buffer_idx == pre_previous_line_start.buffer_idx) { idx_t result_idx = 0; - for (idx_t i = previous_line_start.buffer_pos; i < iterator.pos.buffer_pos; i++) { + for (idx_t i = pre_previous_line_start.buffer_pos + 1; i < previous_line_start.buffer_pos; i++) { result[result_idx++] = buffer_ptr[i]; } } else { diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 7e3a24d6f77c..f719a83521fd 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -147,7 +147,6 @@ void CSVGlobalState::FillRejectsTable() { lock_guard lock(rejects->write_lock); auto &table = rejects->GetTable(context); InternalAppender appender(context, table); - for (auto &file : file_scans) { auto file_name = file->file_path; auto &errors = file->error_handler->errors; @@ -177,8 +176,11 @@ void CSVGlobalState::FillRejectsTable() { // 4. Column Name (If Applicable) appender.Append(string_t("\"" + col_name + "\"")); // 5. Error Type (ENUM?) - // 6. Full Error Message - // 7. Original CSV Line + appender.Append(string_t("CAST")); + // 6. Original CSV Line + appender.Append(string_t(error.csv_row)); + // 7. 
Full Error Message + appender.Append(string_t(error.error_message)); appender.EndRow(); } appender.Close(); diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 2e64d637e0ec..d1c9f13169aa 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -27,10 +27,10 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData string enum_name = "CSV_ERROR_TYPE"; Vector order_errors(LogicalType::VARCHAR, 5); order_errors.SetValue(0, "CAST"); - order_errors.SetValue(0, "MISSING COLUMNS"); - order_errors.SetValue(0, "TOO MANY COLUMNS"); - order_errors.SetValue(0, "UNQUOTED VALUE"); - order_errors.SetValue(0, "LINE SIZE OVER MAXIMUM"); + order_errors.SetValue(1, "MISSING COLUMNS"); + order_errors.SetValue(2, "TOO MANY COLUMNS"); + order_errors.SetValue(3, "UNQUOTED VALUE"); + order_errors.SetValue(4, "LINE SIZE OVER MAXIMUM"); LogicalType enum_type = LogicalType::ENUM(enum_name, order_errors, 5); auto type_info = make_uniq(enum_name, enum_type); type_info->temporary = true; @@ -46,18 +46,16 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData // 2. Row Line info->columns.AddColumn(ColumnDefinition("line", LogicalType::BIGINT)); // 3. Column Index (If Applicable) - info->columns.AddColumn(ColumnDefinition("column", LogicalType::BIGINT)); + info->columns.AddColumn(ColumnDefinition("column_idx", LogicalType::BIGINT)); // 4. Column Name (If Applicable) info->columns.AddColumn(ColumnDefinition("column_name", LogicalType::VARCHAR)); // 5. Error Type info->columns.AddColumn(ColumnDefinition("error_type", enum_type)); - // 6. Full Error Message - info->columns.AddColumn(ColumnDefinition("error_message", LogicalType::VARCHAR)); - // 7. Original CSV Line + // 6. Original CSV Line info->columns.AddColumn(ColumnDefinition("csv_line", LogicalType::VARCHAR)); - + // 7. 
Full Error Message + info->columns.AddColumn(ColumnDefinition("error_message", LogicalType::VARCHAR)); catalog.CreateTable(context, std::move(info)); - count = 0; } From 5eae50f43d7824b66785b0778d419bff06bcce65 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 28 Feb 2024 12:30:17 -0300 Subject: [PATCH 007/147] We can also store the global byte where an error shows up --- .../csv_scanner/buffer_manager/csv_buffer.cpp | 10 +++++----- .../csv_scanner/scanner/string_value_scanner.cpp | 9 +++++---- .../table_function/global_csv_state.cpp | 12 +++++++----- .../operator/csv_scanner/util/csv_error.cpp | 8 ++++---- .../operator/persistent/csv_rejects_table.cpp | 16 +++++++++------- .../operator/csv_scanner/csv_buffer.hpp | 12 +++++++----- .../execution/operator/csv_scanner/csv_error.hpp | 7 +++++-- .../csv_scanner/string_value_scanner.hpp | 3 +++ 8 files changed, 45 insertions(+), 32 deletions(-) diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp index 8c29ae79fb43..e5a53bdeb1f3 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp @@ -5,7 +5,7 @@ namespace duckdb { CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle, idx_t &global_csv_current_position, idx_t file_number_p) - : context(context), file_number(file_number_p), can_seek(file_handle.CanSeek()) { + : context(context), requested_size(buffer_size_p), file_number(file_number_p), can_seek(file_handle.CanSeek()) { AllocateBuffer(buffer_size_p); auto buffer = Ptr(); actual_buffer_size = file_handle.Read(buffer, buffer_size_p); @@ -19,8 +19,8 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t buffer_size, idx_t global_csv_current_position, idx_t file_number_p, idx_t buffer_idx_p) - : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p), - can_seek(file_handle.CanSeek()), buffer_idx(buffer_idx_p) { + : context(context), requested_size(buffer_size), global_csv_start(global_csv_current_position), + file_number(file_number_p), can_seek(file_handle.CanSeek()), buffer_idx(buffer_idx_p) { AllocateBuffer(buffer_size); auto buffer = handle.Ptr(); actual_buffer_size = file_handle.Read(handle.Ptr(), buffer_size); @@ -73,8 +73,8 @@ shared_ptr CSVBuffer::Pin(CSVFileHandle &file_handle, bool &has Reload(file_handle); has_seeked = true; } - return make_shared(buffer_manager.Pin(block), actual_buffer_size, last_buffer, file_number, - buffer_idx); + return make_shared(buffer_manager.Pin(block), actual_buffer_size, requested_size, last_buffer, + file_number, buffer_idx); } void CSVBuffer::Unpin() { diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 1002d9e6b164..10bd3ce3029d 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -354,8 +354,9 @@ bool StringValueResult::AddRowInternal() { auto error_string = error.str(); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - 1); auto borked_line = ReconstructCurrentLine(); - auto csv_error = CSVError::CastError(state_machine.options, names[cast_error.first], error_string, - cast_error.first, borked_line, 
lines_per_batch); + auto csv_error = CSVError::CastError( + state_machine.options, names[cast_error.first], error_string, cast_error.first, borked_line, + lines_per_batch, pre_previous_line_start.GetGlobalPosition(buffer_handles.front()->requested_size)); error_handler.Error(csv_error); } // If we got here it means we are ignoring errors, hence we need to signify to our result scanner to ignore this @@ -623,7 +624,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { // auto borked_line = result.ReconstructCurrentLine(); string empty; auto csv_error = CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], - error_message, col_idx, empty, lines_per_batch); + error_message, col_idx, empty, lines_per_batch, 0); error_handler->Error(csv_error); } borked_lines.insert(line_error++); @@ -641,7 +642,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { lines_read - parse_chunk.size() + line_error); string empty; auto csv_error = CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], - error_message, col_idx, empty, lines_per_batch); + error_message, col_idx, empty, lines_per_batch, 0); error_handler->Error(csv_error); } diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index f719a83521fd..77982d83fab2 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -171,15 +171,17 @@ void CSVGlobalState::FillRejectsTable() { appender.Append(string_t(file_name)); // 2. Row Line appender.Append(row_line); - // 3. Column Index (If Applicable) + // 3. Byte Position where error occurred + appender.Append(error.byte_position); + // 4. Column Index (If Applicable) appender.Append(col_idx); - // 4. Column Name (If Applicable) + // 5. Column Name (If Applicable) appender.Append(string_t("\"" + col_name + "\"")); - // 5. Error Type (ENUM?) + // 6. Error Type (ENUM?) appender.Append(string_t("CAST")); - // 6. Original CSV Line + // 7. Original CSV Line appender.Append(string_t(error.csv_row)); - // 7. Full Error Message + // 8. 
Full Error Message appender.Append(string_t(error.error_message)); appender.EndRow(); } diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index 4bd08a7d7a8d..b77437c88f6b 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -77,9 +77,9 @@ CSVError::CSVError(string error_message_p, CSVErrorType type_p, LinesPerBoundary } CSVError::CSVError(string error_message_p, CSVErrorType type_p, idx_t column_idx_p, string csv_row_p, - LinesPerBoundary error_info_p) + LinesPerBoundary error_info_p, idx_t byte_position_p) : error_message(std::move(error_message_p)), type(type_p), column_idx(column_idx_p), csv_row(std::move(csv_row_p)), - error_info(error_info_p) { + error_info(error_info_p), byte_position(byte_position_p) { } CSVError CSVError::ColumnTypesError(case_insensitive_map_t sql_types_per_column, const vector &names) { @@ -103,7 +103,7 @@ CSVError CSVError::ColumnTypesError(case_insensitive_map_t sql_types_per_ } CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, idx_t column_idx, - string &csv_row, LinesPerBoundary error_info) { + string &csv_row, LinesPerBoundary error_info, idx_t byte_position) { std::ostringstream error; // Which column error << "Error when converting column \"" << column_name << "\"." << std::endl; @@ -112,7 +112,7 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_nam error << std::endl; // What were the options error << options.ToString(); - return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info); + return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info, byte_position); } CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info) { diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index d1c9f13169aa..3f2acf553f21 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -44,16 +44,18 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData // 1. File Path info->columns.AddColumn(ColumnDefinition("file", LogicalType::VARCHAR)); // 2. Row Line - info->columns.AddColumn(ColumnDefinition("line", LogicalType::BIGINT)); - // 3. Column Index (If Applicable) - info->columns.AddColumn(ColumnDefinition("column_idx", LogicalType::BIGINT)); - // 4. Column Name (If Applicable) + info->columns.AddColumn(ColumnDefinition("line", LogicalType::UBIGINT)); + // 3. Byte Position where error occurred + info->columns.AddColumn(ColumnDefinition("byte_position", LogicalType::UBIGINT)); + // 4. Column Index (If Applicable) + info->columns.AddColumn(ColumnDefinition("column_idx", LogicalType::UBIGINT)); + // 5. Column Name (If Applicable) info->columns.AddColumn(ColumnDefinition("column_name", LogicalType::VARCHAR)); - // 5. Error Type + // 6. Error Type info->columns.AddColumn(ColumnDefinition("error_type", enum_type)); - // 6. Original CSV Line + // 7. Original CSV Line info->columns.AddColumn(ColumnDefinition("csv_line", LogicalType::VARCHAR)); - // 7. Full Error Message + // 8. 
Full Error Message info->columns.AddColumn(ColumnDefinition("error_message", LogicalType::VARCHAR)); catalog.CreateTable(context, std::move(info)); count = 0; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp index 72665ae2de54..e71b75e19553 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp @@ -18,16 +18,17 @@ namespace duckdb { class CSVBufferHandle { public: - CSVBufferHandle(BufferHandle handle_p, idx_t actual_size_p, const bool is_final_buffer_p, idx_t file_idx_p, - idx_t buffer_index_p) - : handle(std::move(handle_p)), actual_size(actual_size_p), is_last_buffer(is_final_buffer_p), - file_idx(file_idx_p), buffer_idx(buffer_index_p) {}; - CSVBufferHandle() : actual_size(0), is_last_buffer(false), file_idx(0), buffer_idx(0) {}; + CSVBufferHandle(BufferHandle handle_p, idx_t actual_size_p, idx_t requested_size_p, const bool is_final_buffer_p, + idx_t file_idx_p, idx_t buffer_index_p) + : handle(std::move(handle_p)), actual_size(actual_size_p), requested_size(requested_size_p), + is_last_buffer(is_final_buffer_p), file_idx(file_idx_p), buffer_idx(buffer_index_p) {}; + CSVBufferHandle() : actual_size(0), requested_size(0), is_last_buffer(false), file_idx(0), buffer_idx(0) {}; ~CSVBufferHandle() { } //! Handle created during allocation BufferHandle handle; const idx_t actual_size; + const idx_t requested_size; const bool is_last_buffer; const idx_t file_idx; const idx_t buffer_idx; @@ -86,6 +87,7 @@ class CSVBuffer { ClientContext &context; //! Actual size can be smaller than the buffer size in case we allocate it too optimistically. idx_t actual_buffer_size; + idx_t requested_size; //! Global position from the CSV File where this buffer starts idx_t global_csv_start = 0; //! Number of the file that is in this buffer diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp index c40045b74bc5..44bd4f25913a 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp @@ -50,13 +50,14 @@ enum CSVErrorType : uint8_t { class CSVError { public: CSVError() {}; - CSVError(string error_message, CSVErrorType type, idx_t column_idx, string csv_row, LinesPerBoundary error_info); + CSVError(string error_message, CSVErrorType type, idx_t column_idx, string csv_row, LinesPerBoundary error_info, + idx_t byte_position); CSVError(string error_message, CSVErrorType type, LinesPerBoundary error_info); //! Produces error messages for column name -> type mismatch. static CSVError ColumnTypesError(case_insensitive_map_t sql_types_per_column, const vector &names); //! Produces error messages for casting errors static CSVError CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, - idx_t column_idx, string &csv_row, LinesPerBoundary error_info); + idx_t column_idx, string &csv_row, LinesPerBoundary error_info, idx_t byte_position); //! Produces error for when the line size exceeds the maximum line size option static CSVError LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info); //! Produces error for when the sniffer couldn't find viable options @@ -84,6 +85,8 @@ class CSVError { string csv_row; //! Line information regarding this error LinesPerBoundary error_info; + //! 
Global Byte Position where error occurred. + idx_t byte_position; }; class CSVErrorHandler { diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 4750da6b65db..8087888393a4 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -42,6 +42,9 @@ class LinePosition { } return other.buffer_size - other.buffer_pos + buffer_pos; } + idx_t GetGlobalPosition(idx_t requested_buffer_size) { + return requested_buffer_size * buffer_idx + buffer_pos + 1; + } idx_t buffer_pos = 0; idx_t buffer_size = 0; idx_t buffer_idx = 0; From 5951413e0cd88990714a409854717755e6fcc158 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 28 Feb 2024 13:51:26 -0300 Subject: [PATCH 008/147] Fixing up old tests and fixing small bugs --- .../scanner/string_value_scanner.cpp | 33 +-- .../csv_scanner/string_value_scanner.hpp | 3 + .../copy/csv/rejects/csv_rejects_read.test | 232 +++++++++++------- .../csv/rejects/test_invalid_parameters.test | 56 +++++ 4 files changed, 222 insertions(+), 102 deletions(-) create mode 100644 test/sql/copy/csv/rejects/test_invalid_parameters.test diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 10bd3ce3029d..daa56dd209e0 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -26,6 +26,7 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m buffer_ptr = buffer_handle->Ptr(); buffer_size = buffer_handle->actual_size; last_position = buffer_position; + requested_size = buffer_handle->requested_size; // Current Result information previous_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, buffer_handle->actual_size}; @@ -354,9 +355,9 @@ bool StringValueResult::AddRowInternal() { auto error_string = error.str(); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - 1); auto borked_line = ReconstructCurrentLine(); - auto csv_error = CSVError::CastError( - state_machine.options, names[cast_error.first], error_string, cast_error.first, borked_line, - lines_per_batch, pre_previous_line_start.GetGlobalPosition(buffer_handles.front()->requested_size)); + auto csv_error = CSVError::CastError(state_machine.options, names[cast_error.first], error_string, + cast_error.first, borked_line, lines_per_batch, + pre_previous_line_start.GetGlobalPosition(requested_size)); error_handler.Error(csv_error); } // If we got here it means we are ignoring errors, hence we need to signify to our result scanner to ignore this @@ -414,20 +415,20 @@ bool StringValueResult::AddRowInternal() { } bool StringValueResult::AddRow(StringValueResult &result, const idx_t buffer_pos) { + LinePosition current_line_start = {result.iterator.pos.buffer_idx, result.iterator.pos.buffer_pos, + result.buffer_size}; + idx_t current_line_size = current_line_start - result.previous_line_start; + if (result.store_line_size) { + result.error_handler.NewMaxLineSize(current_line_size); + } + if (current_line_size > result.state_machine.options.maximum_line_size) { + LinesPerBoundary lines_per_batch(result.iterator.GetBoundaryIdx(), result.number_of_rows); + auto csv_error = CSVError::LineSizeError(result.state_machine.options, current_line_size, 
lines_per_batch); + result.error_handler.Error(csv_error); + } + result.pre_previous_line_start = result.previous_line_start; + result.previous_line_start = current_line_start; if (result.last_position <= buffer_pos) { - LinePosition current_line_start = {result.iterator.pos.buffer_idx, result.iterator.pos.buffer_pos, - result.buffer_size}; - idx_t current_line_size = current_line_start - result.previous_line_start; - if (result.store_line_size) { - result.error_handler.NewMaxLineSize(current_line_size); - } - if (current_line_size > result.state_machine.options.maximum_line_size) { - LinesPerBoundary lines_per_batch(result.iterator.GetBoundaryIdx(), result.number_of_rows); - auto csv_error = CSVError::LineSizeError(result.state_machine.options, current_line_size, lines_per_batch); - result.error_handler.Error(csv_error); - } - result.pre_previous_line_start = result.previous_line_start; - result.previous_line_start = current_line_start; // We add the value if (result.quoted) { StringValueResult::AddQuotedValue(result, buffer_pos); diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 8087888393a4..00e0bb80b2c2 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -104,6 +104,9 @@ class StringValueResult : public ScannerResult { //! We must ensure that we keep the buffers alive until processing the query result vector> buffer_handles; + //! Requested size of buffers (i.e., either 32Mb or set by buffer_size parameter) + idx_t requested_size; + //! If the current row has an error, we have to skip it bool ignore_current_row = false; //! 
Specialized code for quoted values, makes sure to remove quotes and escapes diff --git a/test/sql/copy/csv/rejects/csv_rejects_read.test b/test/sql/copy/csv/rejects/csv_rejects_read.test index 16b73ae7ac65..5713e91760a0 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_read.test +++ b/test/sql/copy/csv/rejects/csv_rejects_read.test @@ -6,54 +6,6 @@ require skip_reload # FIXME: https://github.com/duckdb/duckdb/issues/7755 require vector_size 2048 -# Test invalid arguments -statement error -SELECT * FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/bad.csv', - columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, - ignore_errors=false, - rejects_table='csv_rejects_table' -) ----- -only supported when IGNORE_ERRORS is set to true - -statement error -SELECT * FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/bad.csv', - columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, - ignore_errors=true, - rejects_table='') ----- -REJECTS_TABLE option cannot be empty - -statement error -SELECT * FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/bad.csv', - columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, - ignore_errors=true, - rejects_table='csv_rejects_table', - union_by_name=true) ----- -UNION_BY_NAME is set to true - -statement error -SELECT * FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/bad.csv', - columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, - ignore_errors=true, - rejects_limit=10) ----- -REJECTS_LIMIT option is only supported when REJECTS_TABLE is set to a table name - -statement error -SELECT * FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/bad.csv', - columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, - ignore_errors=true, - rejects_table='csv_rejects_table', - rejects_limit=-1) ----- -REJECTS_LIMIT: cannot be negative # Basic test query III rowsort @@ -66,11 +18,17 @@ SELECT * FROM read_csv( 1 2 AAA 6 7 CCC -query IIIIII rowsort -SELECT "line", "column", "column_name", "parsed_value", "error", regexp_replace("file", '\\', '/', 'g') +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -2 1 "col1" BBB Could not convert string 'BBB' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/bad.csv +test/sql/copy/csv/data/error/mismatch/bad.csv 2 1 "col1" CAST 4,BBB,9, 9 + +query I +SELECT error_message +FROM csv_rejects_table; +---- +:.*Could not convert string "BBB" to 'INTEGER'.* statement ok DROP TABLE csv_rejects_table; @@ -85,13 +43,31 @@ SELECT * FROM read_csv( ---- 4 5 9 -query IIIIII rowsort -SELECT "line", "column", "column_name", "parsed_value", "error", regexp_replace("file", '\\', '/', 'g') +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -1 2 "col2" DDD Could not convert string 'DDD' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/bad2.csv -3 0 "col0" EEE Could not convert string 'EEE' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/bad2.csv -3 2 "col2" FFF Could not convert string 'FFF' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/bad2.csv +test/sql/copy/csv/data/error/mismatch/bad2.csv 1 2 "col2" CAST ,2,DDD, 1 +test/sql/copy/csv/data/error/mismatch/bad2.csv 3 0 "col0" CAST EEE,7,FFF, 16 +test/sql/copy/csv/data/error/mismatch/bad2.csv 3 2 "col2" CAST EEE,7,FFF, 16 + +query I +SELECT error_message +FROM csv_rejects_table 
where line=1 and column_idx=2;
+----
+<REGEX>:.*Could not convert string "DDD" to 'INTEGER'.*
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=3 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "EEE" to 'INTEGER'.*
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=3 and column_idx=2;
+----
+<REGEX>:.*Could not convert string "FFF" to 'INTEGER'.*
 
 statement ok
 DROP TABLE csv_rejects_table;
@@ -110,12 +86,24 @@ SELECT * FROM read_csv(
 6 7 CCC
 
-query IIIIII rowsort
-SELECT "line", "column", "column_name", "parsed_value", "error", regexp_replace("file", '\\', '/', 'g')
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
 FROM csv_rejects_table;
 ----
-2 1 "col1" BBB Could not convert string 'BBB' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/bad.csv
-3 0 "col0" EEE Could not convert string 'EEE' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/bad2.csv
+test/sql/copy/csv/data/error/mismatch/bad.csv 2 1 "col1" CAST 4,BBB,9, 9
+test/sql/copy/csv/data/error/mismatch/bad2.csv 3 0 "col0" CAST EEE,7,FFF, 16
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=2 and column_idx=1;
+----
+<REGEX>:.*Could not convert string "BBB" to 'INTEGER'.*
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=3 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "EEE" to 'INTEGER'.*
 
 statement ok
 DROP TABLE csv_rejects_table;
@@ -154,12 +142,24 @@ SELECT SUM(num) FROM read_csv(
 ----
 4270
 
-query IIIIII rowsort
-SELECT "line", "column", "column_name", "parsed_value", "error", regexp_replace("file", '\\', '/', 'g')
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
 FROM csv_rejects_table;
 ----
-2176 0 "num" B Could not convert string 'B' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/big_bad.csv
-4176 0 "num" C Could not convert string 'C' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/big_bad.csv
+test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 0 "num" CAST B, A 10875
+test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 0 "num" CAST C, A 20875
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=2176 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "B" to 'INTEGER'.*
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=4176 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "C" to 'INTEGER'.*
 
 statement ok
 DROP TABLE csv_rejects_table;
@@ -173,12 +173,24 @@ SELECT SUM(num) FROM read_csv(
 ----
 6774
 
-query IIIIII rowsort
-SELECT "line", "column", "column_name", "parsed_value", "error", regexp_replace("file", '\\', '/', 'g')
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
 FROM csv_rejects_table;
 ----
-3680 0 "num" B Could not convert string 'B' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/big_bad2.csv
-5680 0 "num" C Could not convert string 'C' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/big_bad2.csv
+test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 0 "num" CAST B, A 18395
+test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 0 "num" CAST C, A 28395
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=3680 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "B" to 'INTEGER'.*
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=5680 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "C" to 'INTEGER'.*
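
# A free-standing sketch of the reworked rejects flow exercised above
# (illustrative editor's example, not part of this patch): read one of the
# same data files with ignore_errors and a rejects_table, then inspect every
# column the new schema records. The all-INTEGER column layout is assumed
# from the tests above.
#
#   SELECT * FROM read_csv(
#       'test/sql/copy/csv/data/error/mismatch/bad2.csv',
#       columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'INTEGER'},
#       ignore_errors = true,
#       rejects_table = 'csv_rejects_table');
#
#   SELECT file, line, column_idx, column_name, error_type,
#          csv_line, byte_position, error_message
#   FROM csv_rejects_table
#   ORDER BY line, column_idx;
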
statement ok DROP TABLE csv_rejects_table; @@ -193,14 +205,38 @@ SELECT SUM(num) FROM read_csv( ---- 11044 -query IIIIII rowsort -SELECT "line", "column", "column_name", "parsed_value", "error", regexp_replace("file", '\\', '/', 'g') +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -2176 0 "num" B Could not convert string 'B' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/big_bad.csv -3680 0 "num" B Could not convert string 'B' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/big_bad2.csv -4176 0 "num" C Could not convert string 'C' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/big_bad.csv -5680 0 "num" C Could not convert string 'C' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/big_bad2.csv +test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 0 "num" CAST B, A 10875 +test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 0 "num" CAST C, A 20875 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 0 "num" CAST B, A 18395 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 0 "num" CAST C, A 28395 + +query I +SELECT error_message +FROM csv_rejects_table where line=3680 and column_idx=0; +---- +:.*Could not convert string "B" to 'INTEGER'.* + +query I +SELECT error_message +FROM csv_rejects_table where line=5680 and column_idx=0; +---- +:.*Could not convert string "C" to 'INTEGER'.* + +query I +SELECT error_message +FROM csv_rejects_table where line=2176 and column_idx=0; +---- +:.*Could not convert string "B" to 'INTEGER'.* + +query I +SELECT error_message +FROM csv_rejects_table where line=4176 and column_idx=0; +---- +:.*Could not convert string "C" to 'INTEGER'.* statement ok DROP TABLE csv_rejects_table; @@ -223,19 +259,43 @@ ON L.num = R.num; 1 A 1 A 3 C 3 C -query IIIIII rowsort -SELECT "line", "column", "column_name", "parsed_value", "error", regexp_replace("file", '\\', '/', 'g') +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table_left; ---- -3 0 "num" X Could not convert string 'X' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/small1.csv -6 0 "num" X Could not convert string 'X' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/small1.csv +test/sql/copy/csv/data/error/mismatch/small1.csv 3 0 "num" CAST X,Y 14 +test/sql/copy/csv/data/error/mismatch/small1.csv 6 0 "num" CAST X,Y 26 -query IIIIII rowsort -SELECT "line", "column", "column_name", "parsed_value", "error", regexp_replace("file", '\\', '/', 'g') +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table_right; ---- -3 0 "num" X Could not convert string 'X' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/small2.csv -5 0 "num" X Could not convert string 'X' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/small2.csv +test/sql/copy/csv/data/error/mismatch/small2.csv 3 0 "num" CAST X,Y 14 +test/sql/copy/csv/data/error/mismatch/small2.csv 5 0 "num" CAST X,Y 22 + +query I +SELECT error_message +FROM csv_rejects_table_left where line=3 and column_idx=0; +---- +:.*Could not convert string "X" to 'INTEGER'.* + +query I +SELECT error_message +FROM csv_rejects_table_left where line=6 and column_idx=0; +---- +:.*Could not convert string "X" to 'INTEGER'.* + +query I +SELECT error_message +FROM csv_rejects_table_right where line=3 and column_idx=0; +---- +:.*Could not convert 
string "X" to 'INTEGER'.* + +query I +SELECT error_message +FROM csv_rejects_table_right where line=5 and column_idx=0; +---- +:.*Could not convert string "X" to 'INTEGER'.* statement ok DROP TABLE csv_rejects_table_left; @@ -264,12 +324,12 @@ ON L.num = R.num; 3 C 3 C -query IIIIII rowsort -SELECT "line", "column", "column_name", "parsed_value", "error", regexp_replace("file", '\\', '/', 'g') +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table_left; ---- -3 0 "num" X Could not convert string 'X' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/small1.csv -6 0 "num" X Could not convert string 'X' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/small1.csv +test/sql/copy/csv/data/error/mismatch/small1.csv 3 0 "num" CAST X,Y 14 +test/sql/copy/csv/data/error/mismatch/small1.csv 6 0 "num" CAST X,Y 26 query I SELECT COUNT(*) diff --git a/test/sql/copy/csv/rejects/test_invalid_parameters.test b/test/sql/copy/csv/rejects/test_invalid_parameters.test new file mode 100644 index 000000000000..d403d0274948 --- /dev/null +++ b/test/sql/copy/csv/rejects/test_invalid_parameters.test @@ -0,0 +1,56 @@ +# name: test/sql/copy/csv/rejects/test_invalid_parameters.test +# group: [rejects] + +require skip_reload + +# FIXME: https://github.com/duckdb/duckdb/issues/7755 +require vector_size 2048 + +# Test invalid arguments +statement error +SELECT * FROM read_csv( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, + ignore_errors=false, + rejects_table='csv_rejects_table' +) +---- +only supported when IGNORE_ERRORS is set to true + +statement error +SELECT * FROM read_csv( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, + ignore_errors=true, + rejects_table='') +---- +REJECTS_TABLE option cannot be empty + +statement error +SELECT * FROM read_csv( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, + ignore_errors=true, + rejects_table='csv_rejects_table', + union_by_name=true) +---- +UNION_BY_NAME is set to true + +statement error +SELECT * FROM read_csv( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, + ignore_errors=true, + rejects_limit=10) +---- +REJECTS_LIMIT option is only supported when REJECTS_TABLE is set to a table name + +statement error +SELECT * FROM read_csv( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, + ignore_errors=true, + rejects_table='csv_rejects_table', + rejects_limit=-1) +---- +REJECTS_LIMIT: cannot be negative From a8f2dcd01d18af6b0680fd90c81b7d3b427a79ba Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 28 Feb 2024 14:15:41 -0300 Subject: [PATCH 009/147] remove old parameter, cleanup of older tests --- .../copy/csv/rejects/csv_rejects_auto.test | 146 +++--------------- .../copy/csv/rejects/csv_rejects_read.test | 1 - .../csv/rejects/csv_rejects_recovery.test | 97 ------------ .../csv/rejects/test_invalid_parameters.test | 57 +++++++ 4 files changed, 78 insertions(+), 223 deletions(-) delete mode 100644 test/sql/copy/csv/rejects/csv_rejects_recovery.test diff --git a/test/sql/copy/csv/rejects/csv_rejects_auto.test b/test/sql/copy/csv/rejects/csv_rejects_auto.test index 5dc4358708de..887bd282db73 100644 --- 
a/test/sql/copy/csv/rejects/csv_rejects_auto.test
+++ b/test/sql/copy/csv/rejects/csv_rejects_auto.test
@@ -6,63 +6,6 @@ require skip_reload
 # FIXME: https://github.com/duckdb/duckdb/issues/7755
 require vector_size 2048
 
-# Test invalid arguments
-statement error
-SELECT * FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/bad.csv',
-    ignore_errors=false,
-    rejects_table='csv_rejects_table'
-)
----
-only supported when IGNORE_ERRORS is set to true
-
-statement error
-SELECT * FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/bad.csv',
-    ignore_errors=true,
-    rejects_table='')
----
-REJECTS_TABLE option cannot be empty
-
-statement error
-SELECT * FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/bad.csv',
-    ignore_errors=true,
-    rejects_table='csv_rejects_table',
-    union_by_name=true)
----
-UNION_BY_NAME is set to true
-
-statement error
-SELECT * FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/bad.csv',
-    ignore_errors=true,
-    rejects_limit=10)
----
-REJECTS_LIMIT option is only supported when REJECTS_TABLE is set to a table name
-
-statement error
-SELECT * FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/bad.csv',
-    ignore_errors=true,
-    rejects_table='csv_rejects_table',
-    rejects_limit=-1)
----
-REJECTS_LIMIT: cannot be negative
-
-
-query III
-SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*) FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
-    sample_size=3000,
-    rejects_table='csv_rejects_table',
-    ignore_errors=true, header = 0);
----
-VARCHAR VARCHAR 11048
-
-statement ok
-DROP TABLE csv_rejects_table;
-
 # Ensure that we can get the schema if we reduce the sample size and ignore errors
 query IIIII
 SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
@@ -73,85 +16,38 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M
 ----
 BIGINT VARCHAR 11044 11044 2
 
-query IIIIII rowsort
-SELECT regexp_replace("file", '\\', '/', 'g') , "line", "column", "column_name", "parsed_value", "error"
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
 FROM csv_rejects_table;
 ----
-test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 0 "column0" B Could not convert string 'B' to 'BIGINT'
-test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 0 "column0" C Could not convert string 'C' to 'BIGINT'
-test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 0 "column0" B Could not convert string 'B' to 'BIGINT'
-test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 0 "column0" C Could not convert string 'C' to 'BIGINT'
+test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 0 "column0" CAST B, A 10875
+test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 0 "column0" CAST C, A 20875
+test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 0 "column0" CAST B, A 18395
+test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 0 "column0" CAST C, A 28395
 
-statement ok
-DROP TABLE csv_rejects_table;
-
-# Test with recovery columns
 query I
-SELECT SUM(COL1) FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/part1.csv',
-    header=true,
-    ignore_errors=true,
-    sample_size=1,
-    rejects_table='csv_rejects_table',
-    rejects_recovery_columns=['COL2']);
+SELECT error_message
+FROM csv_rejects_table where line=2176 and column_idx=0;
 ----
-5230
-
-statement ok
-DROP TABLE csv_rejects_table;
+<REGEX>:.*Could not convert string "B" to 'BIGINT'.*
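
# A hedged sketch (illustrative editor's example, not part of this patch): the
# recovery-columns tests removed below relied on rejects_recovery_columns; with
# the raw line now stored in csv_line, an equivalent recovery key can be pulled
# from the rejects table itself. This assumes ',' as the delimiter and COL2 as
# the second field, matching the part*.csv files used by the removed tests.
#
#   SELECT line, column_name,
#          string_split(csv_line, ',')[2] AS recovered_col2
#   FROM csv_rejects_table;
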
-# Test with recovery columns
 query I
-SELECT SUM(COL1) FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/part2.csv',
-    header=true,
-    ignore_errors=true,
-    sample_size=1,
-    rejects_table='csv_rejects_table',
-    rejects_recovery_columns=['COL2']);
+SELECT error_message
+FROM csv_rejects_table where line=4176 and column_idx=0;
 ----
-5418
-
-statement ok
-DROP TABLE csv_rejects_table;
+<REGEX>:.*Could not convert string "C" to 'BIGINT'.*
 
-# Test with recovery columns
 query I
-SELECT SUM(COL1) FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/part3.csv',
-    header=true,
-    ignore_errors=true,
-    sample_size=1,
-    rejects_table='csv_rejects_table',
-    rejects_recovery_columns=['COL2']);
+SELECT error_message
+FROM csv_rejects_table where line=3680 and column_idx=0;
 ----
-4151
-
-statement ok
-DROP TABLE csv_rejects_table;
+<REGEX>:.*Could not convert string "B" to 'BIGINT'.*
 
-# Test with recovery columns
 query I
-SELECT SUM(COL1) FROM read_csv_auto(
-    'test/sql/copy/csv/data/error/mismatch/part*.csv',
-    header=true,
-    ignore_errors=true,
-    sample_size=1,
-    rejects_table='csv_rejects_table',
-    rejects_recovery_columns=['COL2']);
+SELECT error_message
+FROM csv_rejects_table where line=5680 and column_idx=0;
 ----
-14799
-
-query IIIIIII rowsort
-SELECT "line", "column", "column_name", "parsed_value", "recovery_columns", "error", regexp_replace("file", '\\', '/', 'g')
-FROM csv_rejects_table;
----
-2058 0 "COL1" B {'COL2': BAD1B} Could not convert string 'B' to 'BIGINT' test/sql/copy/csv/data/error/mismatch/part3.csv
-2325 0 "COL1" B {'COL2': BAD2B} Could not convert string 'B' to 'BIGINT' test/sql/copy/csv/data/error/mismatch/part2.csv
-3137 0 "COL1" B {'COL2': BAD1B} Could not convert string 'B' to 'BIGINT' test/sql/copy/csv/data/error/mismatch/part1.csv
-4058 0 "COL1" C {'COL2': BAD1C} Could not convert string 'C' to 'BIGINT' test/sql/copy/csv/data/error/mismatch/part3.csv
-4325 0 "COL1" C {'COL2': BAD2C} Could not convert string 'C' to 'BIGINT' test/sql/copy/csv/data/error/mismatch/part2.csv
-5137 0 "COL1" C {'COL2': BAD1C} Could not convert string 'C' to 'BIGINT' test/sql/copy/csv/data/error/mismatch/part1.csv
+<REGEX>:.*Could not convert string "C" to 'BIGINT'.*
 
 statement ok
 DROP TABLE csv_rejects_table;
@@ -180,7 +76,7 @@ statement ok
 CREATE TABLE tbl1 (col1 BIGINT, col2 VARCHAR);
 
 statement ok
-COPY tbl1 FROM 'test/sql/copy/csv/data/error/mismatch/half1.csv' 
+COPY tbl1 FROM 'test/sql/copy/csv/data/error/mismatch/half1.csv'
 WITH (HEADER, IGNORE_ERRORS TRUE, SAMPLE_SIZE 1000, REJECTS_TABLE 'csv_rejects_table');
 
 query I
@@ -222,7 +118,7 @@ statement ok
 CREATE TABLE tbl1 (col1 BIGINT, col2 VARCHAR);
 
 statement ok
-COPY tbl1 FROM 'test/sql/copy/csv/data/error/mismatch/half2.csv' 
+COPY tbl1 FROM 'test/sql/copy/csv/data/error/mismatch/half2.csv'
 WITH (HEADER, IGNORE_ERRORS TRUE, SAMPLE_SIZE 1000, REJECTS_TABLE 'csv_rejects_table');
 
 query I
@@ -246,7 +142,7 @@ statement ok
 CREATE TABLE tbl1 (col1 BIGINT, col2 VARCHAR);
 
 statement ok
-COPY tbl1 FROM 'test/sql/copy/csv/data/error/mismatch/half2.csv' 
+COPY tbl1 FROM 'test/sql/copy/csv/data/error/mismatch/half2.csv'
 WITH (HEADER, IGNORE_ERRORS TRUE, SAMPLE_SIZE 1000, REJECTS_TABLE 'csv_rejects_table', REJECTS_LIMIT 1337);
 
 query I
diff --git a/test/sql/copy/csv/rejects/csv_rejects_read.test b/test/sql/copy/csv/rejects/csv_rejects_read.test
index 5713e91760a0..458e485ffc75 100644
--- a/test/sql/copy/csv/rejects/csv_rejects_read.test
+++ b/test/sql/copy/csv/rejects/csv_rejects_read.test
@@ -7,7 +7,6 @@ require skip_reload
 
 require vector_size 2048
 
-# Basic test
query III rowsort SELECT * FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/bad.csv', diff --git a/test/sql/copy/csv/rejects/csv_rejects_recovery.test b/test/sql/copy/csv/rejects/csv_rejects_recovery.test deleted file mode 100644 index 697e7c94a091..000000000000 --- a/test/sql/copy/csv/rejects/csv_rejects_recovery.test +++ /dev/null @@ -1,97 +0,0 @@ -# name: test/sql/copy/csv/rejects/csv_rejects_recovery.test -# group: [rejects] - -require skip_reload - -# Test invalid arguments - -# Should not work without rejects_table -statement error -SELECT SUM(COL1) + SUM(COL3) FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/part*.csv', - ignore_errors=true, - header=true, - columns = {COL3 :'INTEGER', COL1: 'INTEGER', COL2: 'VARCHAR'}, - rejects_recovery_columns=['COL2'] -); ----- -only supported when REJECTS_TABLE is set to a table name - -# Should not work without rejects_recovery_columns as list -statement error -SELECT SUM(COL1) + SUM(COL3) FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/part*.csv', - header=true, - columns = {COL3 :'INTEGER', COL1: 'INTEGER', COL2: 'VARCHAR'}, - rejects_table='csv_rejects_table', - rejects_recovery_columns=['NON_EXISTING_COLUMN'], - ignore_errors=true -); ----- -CSV options could not be auto-detected. Consider setting parser options manually. - -# Should not work without rejects_recovery_columns as list -statement error -SELECT SUM(COL1) + SUM(COL3) FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/part*.csv', - header=true, - columns = {COL3 :'INTEGER', COL1: 'INTEGER', COL2: 'VARCHAR'}, - rejects_table='csv_rejects_table', - rejects_recovery_columns=['NON_EXISTING_COLUMN'], - ignore_errors=true, - auto_detect=false -); ----- -REJECTS_RECOVERY_COLUMNS: column "NON_EXISTING_COLUMN" not found - -# Basic test -query IIII rowsort -SELECT * FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/example.tsv', - sep='\t', - columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'id': 'INTEGER', 'count': 'INTEGER'}, - rejects_table='csv_rejects_table', - rejects_recovery_columns=['name', 'age'], - ignore_errors=true, - auto_detect = false -); ----- -alice 10 1 20 -charlie 7 3 30 - -query IIIIIII rowsort -SELECT "line", "column", "column_name", "parsed_value", "recovery_columns", "error", regexp_replace("file", '\\', '/', 'g') -FROM csv_rejects_table; ----- -2 3 "count" NOT_A_NUMBER {'name': bobby, 'age': 12} Could not convert string 'NOT_A_NUMBER' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/example.tsv - -statement ok -DROP TABLE csv_rejects_table; - -# We should not prune columns that are part of the rejects_recovery_columns -query I -SELECT SUM(COL1) FROM read_csv( - 'test/sql/copy/csv/data/error/mismatch/part*.csv', - columns = {COL1: 'INTEGER', COL2: 'VARCHAR'}, - header=true, - rejects_table='csv_rejects_table', - rejects_recovery_columns=['COL2'], - ignore_errors=true, - auto_detect = false -); ----- -14799 - -query IIIIIII rowsort -SELECT "line", "column", "column_name", "parsed_value", "recovery_columns", "error", regexp_replace("file", '\\', '/', 'g') -FROM csv_rejects_table; ----- -2058 0 "COL1" B {'COL2': BAD1B} Could not convert string 'B' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/part3.csv -2325 0 "COL1" B {'COL2': BAD2B} Could not convert string 'B' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/part2.csv -3137 0 "COL1" B {'COL2': BAD1B} Could not convert string 'B' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/part1.csv -4058 0 "COL1" C {'COL2': BAD1C} Could not convert string 'C' to 'INTEGER' 
test/sql/copy/csv/data/error/mismatch/part3.csv -4325 0 "COL1" C {'COL2': BAD2C} Could not convert string 'C' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/part2.csv -5137 0 "COL1" C {'COL2': BAD1C} Could not convert string 'C' to 'INTEGER' test/sql/copy/csv/data/error/mismatch/part1.csv - -statement ok -DROP TABLE csv_rejects_table; \ No newline at end of file diff --git a/test/sql/copy/csv/rejects/test_invalid_parameters.test b/test/sql/copy/csv/rejects/test_invalid_parameters.test index d403d0274948..5209960fef88 100644 --- a/test/sql/copy/csv/rejects/test_invalid_parameters.test +++ b/test/sql/copy/csv/rejects/test_invalid_parameters.test @@ -54,3 +54,60 @@ SELECT * FROM read_csv( rejects_limit=-1) ---- REJECTS_LIMIT: cannot be negative + +# Test invalid arguments +statement error +SELECT * FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + ignore_errors=false, + rejects_table='csv_rejects_table' +) +---- +only supported when IGNORE_ERRORS is set to true + +statement error +SELECT * FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + ignore_errors=true, + rejects_table='') +---- +REJECTS_TABLE option cannot be empty + +statement error +SELECT * FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + ignore_errors=true, + rejects_table='csv_rejects_table', + union_by_name=true) +---- +UNION_BY_NAME is set to true + +statement error +SELECT * FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + ignore_errors=true, + rejects_limit=10) +---- +REJECTS_LIMIT option is only supported when REJECTS_TABLE is set to a table name + +statement error +SELECT * FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/bad.csv', + ignore_errors=true, + rejects_table='csv_rejects_table', + rejects_limit=-1) +---- +REJECTS_LIMIT: cannot be negative + + +query III +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=3000, + rejects_table='csv_rejects_table', + ignore_errors=true, header = 0); +---- +VARCHAR VARCHAR 11048 + +statement ok +DROP TABLE csv_rejects_table; \ No newline at end of file From bcebeff2a8e5a0978c6527c73c616c501dc8624c Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 29 Feb 2024 08:46:47 -0300 Subject: [PATCH 010/147] Handle CSV Line Errors that fall over multiple buffers --- .../scanner/string_value_scanner.cpp | 28 ++++++++-- .../csv_scanner/string_value_scanner.hpp | 2 +- .../csv/rejects/csv_buffer_size_rejects.test | 55 +++++++++++++++++++ 3 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 test/sql/copy/csv/rejects/csv_buffer_size_rejects.test diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index daa56dd209e0..70da1ac6ee3a 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -21,7 +21,7 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m store_line_size(store_line_size_p), csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p) { // Vector information D_ASSERT(number_of_columns > 0); - buffer_handles.push_back(buffer_handle); + buffer_handles[buffer_handle->buffer_idx] = buffer_handle; // Buffer Information buffer_ptr = buffer_handle->Ptr(); buffer_size = buffer_handle->actual_size; @@ -327,14 +327,27 @@ void 
StringValueResult::NullPaddingQuotedNewlineCheck() {
 string StringValueResult::ReconstructCurrentLine() {
 	idx_t current_line_size = previous_line_start - pre_previous_line_start;
 	string result;
-	result.resize(current_line_size - 1);
 	if (previous_line_start.buffer_idx == pre_previous_line_start.buffer_idx) {
+		result.resize(current_line_size - 1);
 		idx_t result_idx = 0;
 		for (idx_t i = pre_previous_line_start.buffer_pos + 1; i < previous_line_start.buffer_pos; i++) {
 			result[result_idx++] = buffer_ptr[i];
 		}
 	} else {
-		throw InternalException("Oh no");
+		result.resize(current_line_size);
+		if (buffer_handles.find(pre_previous_line_start.buffer_idx) == buffer_handles.end()) {
+			throw InternalException("CSV Buffer is not available to reconstruct CSV Line, please open an issue with "
+			                        "your query and dataset.");
+		}
+		idx_t result_idx = 0;
+		auto first_buffer = buffer_handles[pre_previous_line_start.buffer_idx]->Ptr();
+		auto first_buffer_size = buffer_handles[pre_previous_line_start.buffer_idx]->actual_size;
+		for (idx_t i = pre_previous_line_start.buffer_pos + 1; i < first_buffer_size; i++) {
+			result[result_idx++] = first_buffer[i];
+		}
+		for (idx_t i = 0; i < previous_line_start.buffer_pos; i++) {
+			result[result_idx++] = buffer_ptr[i];
+		}
 	}
 	return result;
 }
@@ -884,7 +897,6 @@ bool StringValueScanner::MoveToNextBuffer() {
 	if (iterator.pos.buffer_pos >= cur_buffer_handle->actual_size) {
 		previous_buffer_handle = cur_buffer_handle;
 		cur_buffer_handle = buffer_manager->GetBuffer(++iterator.pos.buffer_idx);
-		result.buffer_handles.push_back(cur_buffer_handle);
 		if (!cur_buffer_handle) {
 			iterator.pos.buffer_idx--;
 			buffer_handle_ptr = nullptr;
@@ -914,6 +926,8 @@ bool StringValueScanner::MoveToNextBuffer() {
 		}
 		return false;
 	}
+	result.buffer_handles[cur_buffer_handle->buffer_idx] = cur_buffer_handle;
+
 	iterator.pos.buffer_pos = 0;
 	buffer_handle_ptr = cur_buffer_handle->Ptr();
 	// Handle overbuffer value
@@ -1057,6 +1071,12 @@ void StringValueScanner::SetStart() {
 		}
 	}
 	if (iterator.pos.buffer_pos == cur_buffer_handle->actual_size) {
+		// Propagate any errors
+		for (auto &error_vector : scan_finder->error_handler->errors) {
+			for (auto &error : error_vector.second) {
+				error_handler->Error(error);
+			}
+		}
 		// If things go terribly wrong, we never loop indefinitely.
 		iterator.pos.buffer_idx = scan_finder->iterator.pos.buffer_idx;
 		iterator.pos.buffer_pos = scan_finder->iterator.pos.buffer_pos;
diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
index 00e0bb80b2c2..d0e9124c81f2 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
@@ -102,7 +102,7 @@ class StringValueResult : public ScannerResult {
 	idx_t chunk_col_id = 0;
 
 	//! We must ensure that we keep the buffers alive until processing the query result
-	vector<shared_ptr<CSVBufferHandle>> buffer_handles;
+	unordered_map<idx_t, shared_ptr<CSVBufferHandle>> buffer_handles;
 
 	//! Requested size of buffers (i.e., either 32Mb or set by buffer_size parameter)
 	idx_t requested_size;
diff --git a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
new file mode 100644
index 000000000000..f0de6714f96a
--- /dev/null
+++ b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
@@ -0,0 +1,55 @@
+# name: test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
+# description: Force CSV lines with errors to fall mid-buffer
+# group: [rejects]
+
+require skip_reload
+
+# FIXME: https://github.com/duckdb/duckdb/issues/7755
+require vector_size 2048
+
+# Ensure that we can get the schema if we reduce the sample size and ignore errors
+query IIIII
+SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
+    'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
+    sample_size=1,
+    buffer_size=5,
+    rejects_table='csv_rejects_table',
+    ignore_errors=true);
+----
+BIGINT VARCHAR 11044 11044 2
+
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
+FROM csv_rejects_table;
+----
+test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 0 "column0" CAST B, A 10875
+test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 0 "column0" CAST C, A 20875
+test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 0 "column0" CAST B, A 18395
+test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 0 "column0" CAST C, A 28395
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=2176 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "B" to 'BIGINT'.*
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=4176 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "C" to 'BIGINT'.*
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=3680 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "B" to 'BIGINT'.*
+
+query I
+SELECT error_message
+FROM csv_rejects_table where line=5680 and column_idx=0;
+----
+<REGEX>:.*Could not convert string "C" to 'BIGINT'.*
+
+statement ok
+DROP TABLE csv_rejects_table;
\ No newline at end of file
From 8ab1a9a3d67a0192e517d931d333cd1547a52626 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Thu, 29 Feb 2024 09:11:06 -0300
Subject: [PATCH 011/147] rounding off minor details

---
 .../scanner/string_value_scanner.cpp          | 26 +++++++++----------
 .../csv_scanner/string_value_scanner.hpp      |  3 +++
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 70da1ac6ee3a..4adf70bc3ac7 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -353,6 +353,18 @@ string StringValueResult::ReconstructCurrentLine() {
 }
 
 bool StringValueResult::AddRowInternal() {
+	LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, buffer_size};
+	idx_t current_line_size = current_line_start - previous_line_start;
+	if (store_line_size) {
+		error_handler.NewMaxLineSize(current_line_size);
+	}
+	if (current_line_size > state_machine.options.maximum_line_size) {
+		LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), number_of_rows);
+		auto csv_error = CSVError::LineSizeError(state_machine.options, current_line_size, lines_per_batch);
+		error_handler.Error(csv_error);
+	}
+	
pre_previous_line_start = previous_line_start; + previous_line_start = current_line_start; if (ignore_current_row) { // An error occurred on this row, we are ignoring it and resetting our control flag ignore_current_row = false; @@ -428,19 +440,6 @@ bool StringValueResult::AddRowInternal() { } bool StringValueResult::AddRow(StringValueResult &result, const idx_t buffer_pos) { - LinePosition current_line_start = {result.iterator.pos.buffer_idx, result.iterator.pos.buffer_pos, - result.buffer_size}; - idx_t current_line_size = current_line_start - result.previous_line_start; - if (result.store_line_size) { - result.error_handler.NewMaxLineSize(current_line_size); - } - if (current_line_size > result.state_machine.options.maximum_line_size) { - LinesPerBoundary lines_per_batch(result.iterator.GetBoundaryIdx(), result.number_of_rows); - auto csv_error = CSVError::LineSizeError(result.state_machine.options, current_line_size, lines_per_batch); - result.error_handler.Error(csv_error); - } - result.pre_previous_line_start = result.previous_line_start; - result.previous_line_start = current_line_start; if (result.last_position <= buffer_pos) { // We add the value if (result.quoted) { @@ -1082,6 +1081,7 @@ void StringValueScanner::SetStart() { iterator.pos.buffer_pos = scan_finder->iterator.pos.buffer_pos; result.last_position = iterator.pos.buffer_pos; iterator.done = scan_finder->iterator.done; + result.lines_read++; return; } } diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index d0e9124c81f2..74064587a621 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -43,6 +43,9 @@ class LinePosition { return other.buffer_size - other.buffer_pos + buffer_pos; } idx_t GetGlobalPosition(idx_t requested_buffer_size) { + if (buffer_pos == requested_buffer_size) { + return requested_buffer_size * buffer_idx + buffer_pos; + } return requested_buffer_size * buffer_idx + buffer_pos + 1; } idx_t buffer_pos = 0; From b6fd5674a628cb9234c984b3578af2563a31777a Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 29 Feb 2024 14:27:22 -0300 Subject: [PATCH 012/147] lots of adjustments to make the errors accurate for small buffer sizes --- .../scanner/string_value_scanner.cpp | 57 ++++++++++++------- .../csv_scanner/string_value_scanner.hpp | 9 +-- .../csv/rejects/csv_buffer_size_rejects.test | 28 +++++---- 3 files changed, 57 insertions(+), 37 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 4adf70bc3ac7..dcb58116595e 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -324,29 +324,35 @@ void StringValueResult::NullPaddingQuotedNewlineCheck() { } //! 
Reconstructs the current line to be used in error messages -string StringValueResult::ReconstructCurrentLine() { - idx_t current_line_size = previous_line_start - pre_previous_line_start; +string StringValueResult::ReconstructCurrentLine(bool &first_char_nl) { string result; if (previous_line_start.buffer_idx == pre_previous_line_start.buffer_idx) { - result.resize(current_line_size - 1); - idx_t result_idx = 0; - for (idx_t i = pre_previous_line_start.buffer_pos + 1; i < previous_line_start.buffer_pos; i++) { - result[result_idx++] = buffer_ptr[i]; + if (buffer_handles.find(previous_line_start.buffer_idx) == buffer_handles.end()) { + throw InternalException("CSV Buffer is not available to reconstruct CSV Line, please open an issue with " + "your query and dataset."); + } + auto buffer = buffer_handles[pre_previous_line_start.buffer_idx]->Ptr(); + first_char_nl = + buffer[pre_previous_line_start.buffer_pos] == '\n' || buffer[pre_previous_line_start.buffer_pos] == '\r'; + for (idx_t i = pre_previous_line_start.buffer_pos + first_char_nl; i < previous_line_start.buffer_pos; i++) { + result += buffer[i]; } } else { - result.resize(current_line_size); - if (buffer_handles.find(pre_previous_line_start.buffer_idx) == buffer_handles.end()) { + if (buffer_handles.find(pre_previous_line_start.buffer_idx) == buffer_handles.end() || + buffer_handles.find(previous_line_start.buffer_idx) == buffer_handles.end()) { throw InternalException("CSV Buffer is not available to reconstruct CSV Line, please open an issue with " "your query and dataset."); } - idx_t result_idx = 0; auto first_buffer = buffer_handles[pre_previous_line_start.buffer_idx]->Ptr(); auto first_buffer_size = buffer_handles[pre_previous_line_start.buffer_idx]->actual_size; - for (idx_t i = pre_previous_line_start.buffer_pos + 1; i < first_buffer_size; i++) { - result[result_idx++] = first_buffer[i]; + auto second_buffer = buffer_handles[previous_line_start.buffer_idx]->Ptr(); + first_char_nl = first_buffer[pre_previous_line_start.buffer_pos] == '\n' || + first_buffer[pre_previous_line_start.buffer_pos] == '\r'; + for (idx_t i = pre_previous_line_start.buffer_pos + first_char_nl; i < first_buffer_size; i++) { + result += first_buffer[i]; } for (idx_t i = 0; i < previous_line_start.buffer_pos; i++) { - result[result_idx++] = buffer_ptr[i]; + result += second_buffer[i]; } } return result; @@ -379,10 +385,11 @@ bool StringValueResult::AddRowInternal() { << LogicalTypeIdToString(parse_types[cast_error.first]) << "\'"; auto error_string = error.str(); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - 1); - auto borked_line = ReconstructCurrentLine(); + bool first_nl; + auto borked_line = ReconstructCurrentLine(first_nl); auto csv_error = CSVError::CastError(state_machine.options, names[cast_error.first], error_string, cast_error.first, borked_line, lines_per_batch, - pre_previous_line_start.GetGlobalPosition(requested_size)); + pre_previous_line_start.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); } // If we got here it means we are ignoring errors, hence we need to signify to our result scanner to ignore this @@ -1069,23 +1076,35 @@ void StringValueScanner::SetStart() { return; } } - if (iterator.pos.buffer_pos == cur_buffer_handle->actual_size) { + if (iterator.pos.buffer_pos == cur_buffer_handle->actual_size || + scan_finder->iterator.GetBufferIdx() >= iterator.GetBufferIdx()) { // Propagate any errors - for (auto &error_vector : scan_finder->error_handler->errors) { - for (auto &error : 
error_vector.second) { - error_handler->Error(error); + if (!scan_finder->error_handler->errors.empty()) { + for (auto &error_vector : scan_finder->error_handler->errors) { + for (auto &error : error_vector.second) { + error_handler->Error(error); + } } + result.lines_read++; } // If things go terribly wrong, we never loop indefinetly. iterator.pos.buffer_idx = scan_finder->iterator.pos.buffer_idx; iterator.pos.buffer_pos = scan_finder->iterator.pos.buffer_pos; result.last_position = iterator.pos.buffer_pos; iterator.done = scan_finder->iterator.done; - result.lines_read++; return; } } } while (!line_found); + // Propagate any errors + if (!scan_finder->error_handler->errors.empty()) { + for (auto &error_vector : scan_finder->error_handler->errors) { + for (auto &error : error_vector.second) { + error_handler->Error(error); + } + } + result.lines_read++; + } iterator.pos.buffer_idx = scan_finder->result.pre_previous_line_start.buffer_idx; iterator.pos.buffer_pos = scan_finder->result.pre_previous_line_start.buffer_pos; result.last_position = iterator.pos.buffer_pos; diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 74064587a621..84ef406b30aa 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -42,11 +42,8 @@ class LinePosition { } return other.buffer_size - other.buffer_pos + buffer_pos; } - idx_t GetGlobalPosition(idx_t requested_buffer_size) { - if (buffer_pos == requested_buffer_size) { - return requested_buffer_size * buffer_idx + buffer_pos; - } - return requested_buffer_size * buffer_idx + buffer_pos + 1; + idx_t GetGlobalPosition(idx_t requested_buffer_size, bool first_char_nl) { + return requested_buffer_size * buffer_idx + buffer_pos + first_char_nl; } idx_t buffer_pos = 0; idx_t buffer_size = 0; @@ -127,7 +124,7 @@ class StringValueResult : public ScannerResult { static inline bool EmptyLine(StringValueResult &result, const idx_t buffer_pos); inline bool AddRowInternal(); //! 
Reconstructs the current line to be used in error messages
-	string ReconstructCurrentLine();
+	string ReconstructCurrentLine(bool &first_char_nl);
 
 	void HandleOverLimitRows();
diff --git a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
index f0de6714f96a..15461ee1c9f3 100644
--- a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
+++ b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
@@ -7,49 +7,53 @@ require skip_reload
 # FIXME: https://github.com/duckdb/duckdb/issues/7755
 require vector_size 2048
 
+loop buffer_size 5 10
+
 # Ensure that we can get the schema if we reduce the sample size and ignore errors
 query IIIII
 SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
     'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
     sample_size=1,
-    buffer_size=5,
+    buffer_size=${buffer_size},
     rejects_table='csv_rejects_table',
     ignore_errors=true);
 ----
 BIGINT VARCHAR 11044 11044 2
 
-query IIIIIII rowsort
-SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
+query IIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), column_idx, column_name, error_type, csv_line, byte_position
 FROM csv_rejects_table;
 ----
-test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 0 "column0" CAST B, A 10875
-test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 0 "column0" CAST C, A 20875
-test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 0 "column0" CAST B, A 18395
-test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 0 "column0" CAST C, A 28395
+test/sql/copy/csv/data/error/mismatch/big_bad.csv 0 "column0" CAST B, A 10875
+test/sql/copy/csv/data/error/mismatch/big_bad.csv 0 "column0" CAST C, A 20875
+test/sql/copy/csv/data/error/mismatch/big_bad2.csv 0 "column0" CAST B, A 18395
+test/sql/copy/csv/data/error/mismatch/big_bad2.csv 0 "column0" CAST C, A 28395
 
 query I
 SELECT error_message
-FROM csv_rejects_table where line=2176 and column_idx=0;
+FROM csv_rejects_table where byte_position = 10875;
 ----
 <REGEX>:.*Could not convert string "B" to 'BIGINT'.*
 
 query I
 SELECT error_message
-FROM csv_rejects_table where line=4176 and column_idx=0;
+FROM csv_rejects_table where byte_position = 20875;
 ----
 <REGEX>:.*Could not convert string "C" to 'BIGINT'.*
 
 query I
 SELECT error_message
-FROM csv_rejects_table where line=3680 and column_idx=0;
+FROM csv_rejects_table where byte_position = 18395;
 ----
 <REGEX>:.*Could not convert string "B" to 'BIGINT'.*
 
 query I
 SELECT error_message
-FROM csv_rejects_table where line=5680 and column_idx=0;
+FROM csv_rejects_table where byte_position = 28395;
 ----
 <REGEX>:.*Could not convert string "C" to 'BIGINT'.*
 
 statement ok
-DROP TABLE csv_rejects_table;
\ No newline at end of file
+DROP TABLE csv_rejects_table;
+
+endloop
\ No newline at end of file
From 7ad39f20b9d0462b28a5f15993d0c95288d652fa Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Thu, 29 Feb 2024 14:50:35 -0300
Subject: [PATCH 013/147] When resetting the buffer_handle we might have to keep the last one

---
 .../operator/csv_scanner/scanner/string_value_scanner.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index dcb58116595e..da67d5f1bd4a 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -252,7 +252,15 @@ void StringValueResult::Reset() {
 	for (auto &v : validity_mask) {
 		v->SetAllValid(result_size);
 	}
+	// We keep a reference to the buffer from our current iteration if it already exists
+	shared_ptr<CSVBufferHandle> cur_buffer;
+	if (buffer_handles.find(iterator.GetBufferIdx()) != buffer_handles.end()) {
+		cur_buffer = buffer_handles[iterator.GetBufferIdx()];
+	}
 	buffer_handles.clear();
+	if (cur_buffer) {
+		buffer_handles[cur_buffer->buffer_idx] = cur_buffer;
+	}
 }
 
 void StringValueResult::AddQuotedValue(StringValueResult &result, const idx_t buffer_pos) {

From 2826f951f537aa106b0a23c2d01266a6bf81ddb0 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Thu, 29 Feb 2024 15:02:21 -0300
Subject: [PATCH 014/147] fix test

---
 test/sql/copy/csv/rejects/csv_rejects_read.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/sql/copy/csv/rejects/csv_rejects_read.test b/test/sql/copy/csv/rejects/csv_rejects_read.test
index 458e485ffc75..ab5b12db949a 100644
--- a/test/sql/copy/csv/rejects/csv_rejects_read.test
+++ b/test/sql/copy/csv/rejects/csv_rejects_read.test
@@ -46,7 +46,7 @@ query IIIIIII rowsort
 SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
 FROM csv_rejects_table;
 ----
-test/sql/copy/csv/data/error/mismatch/bad2.csv 1 2 "col2" CAST ,2,DDD, 1
+test/sql/copy/csv/data/error/mismatch/bad2.csv 1 2 "col2" CAST 1,2,DDD, 0

From f6496fc22504f5d3ba62476203e436f21661e1ae Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Thu, 29 Feb 2024 15:20:05 -0300
Subject: [PATCH 015/147] We only care about propagating errors if we are ignoring them, as weird as this sounds

---
 .../scanner/string_value_scanner.cpp          |  4 +-
 .../parallel/csv_parallel_buffer_size.test    | 90 +++++++++----------
 2 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index da67d5f1bd4a..4785f54129b8 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -1087,7 +1087,7 @@ void StringValueScanner::SetStart() {
 		if (iterator.pos.buffer_pos == cur_buffer_handle->actual_size ||
 		    scan_finder->iterator.GetBufferIdx() >= iterator.GetBufferIdx()) {
 			// Propagate any errors
-			if (!scan_finder->error_handler->errors.empty()) {
+			if (!scan_finder->error_handler->errors.empty() && state_machine->options.ignore_errors) {
 				for (auto &error_vector : scan_finder->error_handler->errors) {
 					for (auto &error : error_vector.second) {
 						error_handler->Error(error);
@@ -1105,7 +1105,7 @@ void StringValueScanner::SetStart() {
 	} while (!line_found);
 	// Propagate any errors
-	if (!scan_finder->error_handler->errors.empty()) {
+	if (!scan_finder->error_handler->errors.empty() && state_machine->options.ignore_errors) {
 		for (auto &error_vector : scan_finder->error_handler->errors) {
 			for (auto &error : error_vector.second) {
 				error_handler->Error(error);
diff --git a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test
index d1c9d8ce5ee8..f6e1d7ada93a 100644
--- a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test
+++ b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test
@@ -7,51 +7,51 @@
 statement ok
 PRAGMA
verify_parallelism -query III -SELECT sum(a), sum(b), sum(c) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) ----- -111111111 51866 3195 - -query I -SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) ----- -111111111 - -query I -SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) ----- -111111111 - -query IIII -select * from read_csv('test/sql/copy/csv/data/test/multi_column_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) ----- -1 6370 371 p1 -10 214 465 p2 -100 2403 160 p3 -1000 1564 67 p4 -10000 10617 138 p5 -100000 430 181 p6 -1000000 1904 658 p7 -10000000 12845 370 p8 -100000000 15519 785 p9 - -query IIII -select * from read_csv('test/sql/copy/csv/data/test/multi_column_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) ----- -1 6370 371 p1 -10 214 465 p2 -100 2403 160 p3 -1000 1564 67 p4 -10000 10617 138 p5 -100000 430 181 p6 -1000000 1904 658 p7 -10000000 12845 370 p8 -100000000 15519 785 p9 - -query I -SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/new_line_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|') ----- -111 +#query III +#SELECT sum(a), sum(b), sum(c) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) +#---- +#111111111 51866 3195 +# +#query I +#SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) +#---- +#111111111 +# +#query I +#SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) +#---- +#111111111 +# +#query IIII +#select * from read_csv('test/sql/copy/csv/data/test/multi_column_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) +#---- +#1 6370 371 p1 +#10 214 465 p2 +#100 2403 160 p3 +#1000 1564 67 p4 +#10000 10617 138 p5 +#100000 430 181 p6 +#1000000 1904 658 p7 +#10000000 12845 370 p8 +#100000000 15519 785 p9 +# +#query IIII +#select * from read_csv('test/sql/copy/csv/data/test/multi_column_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) +#---- +#1 6370 371 p1 +#10 214 465 p2 +#100 2403 160 p3 +#1000 1564 67 p4 +#10000 10617 138 p5 +#100000 430 181 p6 +#1000000 1904 658 p7 +#10000000 12845 370 p8 +#100000000 15519 785 p9 +# +#query I +#SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/new_line_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', 
delim = '|') +#---- +#111 query I SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/new_line_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=80) From 404c4da61b052555865554aaa89a441091cbb88f Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 29 Feb 2024 15:20:32 -0300 Subject: [PATCH 016/147] woopsie-doopsie --- .../parallel/csv_parallel_buffer_size.test | 90 +++++++++---------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test index f6e1d7ada93a..d1c9d8ce5ee8 100644 --- a/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test +++ b/test/sql/copy/csv/parallel/csv_parallel_buffer_size.test @@ -7,51 +7,51 @@ statement ok PRAGMA verify_parallelism -#query III -#SELECT sum(a), sum(b), sum(c) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) -#---- -#111111111 51866 3195 -# -#query I -#SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) -#---- -#111111111 -# -#query I -#SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) -#---- -#111111111 -# -#query IIII -#select * from read_csv('test/sql/copy/csv/data/test/multi_column_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) -#---- -#1 6370 371 p1 -#10 214 465 p2 -#100 2403 160 p3 -#1000 1564 67 p4 -#10000 10617 138 p5 -#100000 430 181 p6 -#1000000 1904 658 p7 -#10000000 12845 370 p8 -#100000000 15519 785 p9 -# -#query IIII -#select * from read_csv('test/sql/copy/csv/data/test/multi_column_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) -#---- -#1 6370 371 p1 -#10 214 465 p2 -#100 2403 160 p3 -#1000 1564 67 p4 -#10000 10617 138 p5 -#100000 430 181 p6 -#1000000 1904 658 p7 -#10000000 12845 370 p8 -#100000000 15519 785 p9 -# -#query I -#SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/new_line_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|') -#---- -#111 +query III +SELECT sum(a), sum(b), sum(c) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) +---- +111111111 51866 3195 + +query I +SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) +---- +111111111 + +query I +SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/multi_column_integer_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER'), auto_detect='true', delim = '|', buffer_size=30) +---- +111111111 + +query IIII +select * from read_csv('test/sql/copy/csv/data/test/multi_column_string.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 
'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) +---- +1 6370 371 p1 +10 214 465 p2 +100 2403 160 p3 +1000 1564 67 p4 +10000 10617 138 p5 +100000 430 181 p6 +1000000 1904 658 p7 +10000000 12845 370 p8 +100000000 15519 785 p9 + +query IIII +select * from read_csv('test/sql/copy/csv/data/test/multi_column_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=25) +---- +1 6370 371 p1 +10 214 465 p2 +100 2403 160 p3 +1000 1564 67 p4 +10000 10617 138 p5 +100000 430 181 p6 +1000000 1904 658 p7 +10000000 12845 370 p8 +100000000 15519 785 p9 + +query I +SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/new_line_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|') +---- +111 query I SELECT sum(a) FROM read_csv('test/sql/copy/csv/data/test/new_line_string_rn.csv', COLUMNS=STRUCT_PACK(a := 'INTEGER', b := 'INTEGER', c := 'INTEGER', d := 'VARCHAR'), auto_detect='true', delim = '|', buffer_size=80) From ecf76d42a23c69d385b70b8b3a05001636242b14 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Fri, 1 Mar 2024 08:49:03 -0300 Subject: [PATCH 017/147] wip on rejects from flush cast --- data/csv/error/flush_cast.csv | 2814 +++++++++++++++++ .../csv/rejects/csv_rejects_flush_cast.test | 35 + 2 files changed, 2849 insertions(+) create mode 100644 data/csv/error/flush_cast.csv create mode 100644 test/sql/copy/csv/rejects/csv_rejects_flush_cast.test diff --git a/data/csv/error/flush_cast.csv b/data/csv/error/flush_cast.csv new file mode 100644 index 000000000000..33a2cc1af7be --- /dev/null +++ b/data/csv/error/flush_cast.csv @@ -0,0 +1,2814 @@ +a,b +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla +25-09-2001, bla 
+[... ~2,600 identical rows elided: the fixture is 2,814 lines long, and after the "a,b" header every row repeats "25-09-2001, bla" except "B, bla" at line 439 and "c, bla" at line 2813 ...]
+c, bla
+25-09-2001, bla
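The new test below drives this fixture end to end. Only two of its 2,813 data rows fail the %d-%m-%Y DATE cast, and because values are parsed as strings and cast only when a full chunk is flushed, the errors surface at flush time rather than during scanning, which is the code path the later patches in this series instrument. As a rough sketch (assuming a build with this series applied and the fixture at the path above), the same rejects flow can be reproduced interactively:

    -- Illustrative only; mirrors what the test below asserts.
    SELECT count(*)
    FROM read_csv('data/csv/error/flush_cast.csv',
                  columns = {'a': 'DATE', 'b': 'VARCHAR'},
                  dateformat = '%d-%m-%Y',
                  rejects_table = 'csv_rejects_table',
                  ignore_errors = true);
    -- returns 2811: the rows holding 'B' and 'c' were rejected

    SELECT line, column_name, error_message
    FROM csv_rejects_table;

diff --git a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test
new file mode 100644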
index 000000000000..05c9af1cb02d --- /dev/null +++ b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test @@ -0,0 +1,35 @@ +# name: test/sql/copy/csv/rejects/csv_rejects_flush_cast.test +# description: Test that Flush Cast functions properly for the rejects tables +# group: [rejects] + +require skip_reload + +# FIXME: https://github.com/duckdb/duckdb/issues/7755 +require vector_size 2048 + +query III +SELECT typeof(first(a)), typeof(first(b)), COUNT(*) FROM read_csv( + 'data/csv/error/flush_cast.csv', + columns = {'a': 'DATE', 'b': 'VARCHAR'}, + rejects_table='csv_rejects_table', + delim = ',', + dateformat = '%d-%m-%Y', + ignore_errors=true); +---- +DATE VARCHAR 2811 + + +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position +FROM csv_rejects_table; +---- +test/sql/copy/csv/data/error/mismatch/bad.csv 2 1 "col1" CAST 4,BBB,9, 9 + +query I +SELECT error_message +FROM csv_rejects_table; +---- +:.*Could not convert string "BBB" to 'INTEGER'.* + +statement ok +DROP TABLE csv_rejects_table; \ No newline at end of file From 53f612bbe056c3f12c7ab0c3256d8130a64b38a9 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Fri, 1 Mar 2024 10:27:51 -0300 Subject: [PATCH 018/147] introduce FullLinePosition --- .../scanner/string_value_scanner.cpp | 59 ++++++++++--------- .../csv_scanner/string_value_scanner.hpp | 21 +++++-- 2 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 4785f54129b8..516bf437edbb 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -29,8 +29,8 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m requested_size = buffer_handle->requested_size; // Current Result information - previous_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, buffer_handle->actual_size}; - pre_previous_line_start = previous_line_start; + current_line_position.begin = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, buffer_handle->actual_size}; + current_line_position.end = current_line_position.begin; // Fill out Parse Types vector logical_types; parse_types = make_unsafe_uniq_array(number_of_columns); @@ -332,34 +332,33 @@ void StringValueResult::NullPaddingQuotedNewlineCheck() { } //! 
Reconstructs the current line to be used in error messages -string StringValueResult::ReconstructCurrentLine(bool &first_char_nl) { +string FullLinePosition::ReconstructCurrentLine(bool &first_char_nl, + unordered_map> &buffer_handles) { string result; - if (previous_line_start.buffer_idx == pre_previous_line_start.buffer_idx) { - if (buffer_handles.find(previous_line_start.buffer_idx) == buffer_handles.end()) { + if (end.buffer_idx == begin.buffer_idx) { + if (buffer_handles.find(end.buffer_idx) == buffer_handles.end()) { throw InternalException("CSV Buffer is not available to reconstruct CSV Line, please open an issue with " "your query and dataset."); } - auto buffer = buffer_handles[pre_previous_line_start.buffer_idx]->Ptr(); - first_char_nl = - buffer[pre_previous_line_start.buffer_pos] == '\n' || buffer[pre_previous_line_start.buffer_pos] == '\r'; - for (idx_t i = pre_previous_line_start.buffer_pos + first_char_nl; i < previous_line_start.buffer_pos; i++) { + auto buffer = buffer_handles[begin.buffer_idx]->Ptr(); + first_char_nl = buffer[begin.buffer_pos] == '\n' || buffer[begin.buffer_pos] == '\r'; + for (idx_t i = begin.buffer_pos + first_char_nl; i < end.buffer_pos; i++) { result += buffer[i]; } } else { - if (buffer_handles.find(pre_previous_line_start.buffer_idx) == buffer_handles.end() || - buffer_handles.find(previous_line_start.buffer_idx) == buffer_handles.end()) { + if (buffer_handles.find(begin.buffer_idx) == buffer_handles.end() || + buffer_handles.find(end.buffer_idx) == buffer_handles.end()) { throw InternalException("CSV Buffer is not available to reconstruct CSV Line, please open an issue with " "your query and dataset."); } - auto first_buffer = buffer_handles[pre_previous_line_start.buffer_idx]->Ptr(); - auto first_buffer_size = buffer_handles[pre_previous_line_start.buffer_idx]->actual_size; - auto second_buffer = buffer_handles[previous_line_start.buffer_idx]->Ptr(); - first_char_nl = first_buffer[pre_previous_line_start.buffer_pos] == '\n' || - first_buffer[pre_previous_line_start.buffer_pos] == '\r'; - for (idx_t i = pre_previous_line_start.buffer_pos + first_char_nl; i < first_buffer_size; i++) { + auto first_buffer = buffer_handles[begin.buffer_idx]->Ptr(); + auto first_buffer_size = buffer_handles[begin.buffer_idx]->actual_size; + auto second_buffer = buffer_handles[end.buffer_idx]->Ptr(); + first_char_nl = first_buffer[begin.buffer_pos] == '\n' || first_buffer[begin.buffer_pos] == '\r'; + for (idx_t i = begin.buffer_pos + first_char_nl; i < first_buffer_size; i++) { result += first_buffer[i]; } - for (idx_t i = 0; i < previous_line_start.buffer_pos; i++) { + for (idx_t i = 0; i < end.buffer_pos; i++) { result += second_buffer[i]; } } @@ -368,7 +367,7 @@ string StringValueResult::ReconstructCurrentLine(bool &first_char_nl) { bool StringValueResult::AddRowInternal() { LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, buffer_size}; - idx_t current_line_size = current_line_start - previous_line_start; + idx_t current_line_size = current_line_start - current_line_position.end; if (store_line_size) { error_handler.NewMaxLineSize(current_line_size); } @@ -377,8 +376,8 @@ bool StringValueResult::AddRowInternal() { auto csv_error = CSVError::LineSizeError(state_machine.options, current_line_size, lines_per_batch); error_handler.Error(csv_error); } - pre_previous_line_start = previous_line_start; - previous_line_start = current_line_start; + current_line_position.begin = current_line_position.end; + current_line_position.end = 
current_line_start; if (ignore_current_row) { // An error occurred on this row, we are ignoring it and resetting our control flag ignore_current_row = false; @@ -394,10 +393,10 @@ bool StringValueResult::AddRowInternal() { auto error_string = error.str(); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - 1); bool first_nl; - auto borked_line = ReconstructCurrentLine(first_nl); - auto csv_error = CSVError::CastError(state_machine.options, names[cast_error.first], error_string, - cast_error.first, borked_line, lines_per_batch, - pre_previous_line_start.GetGlobalPosition(requested_size, first_nl)); + auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); + auto csv_error = CSVError::CastError( + state_machine.options, names[cast_error.first], error_string, cast_error.first, borked_line, + lines_per_batch, current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); } // If we got here it means we are ignoring errors, hence we need to signify to our result scanner to ignore this @@ -444,6 +443,7 @@ bool StringValueResult::AddRowInternal() { number_of_rows--; } } + line_positions_per_row[number_of_rows] = current_line_position; cur_col_id = 0; chunk_col_id = 0; number_of_rows++; @@ -699,9 +699,10 @@ void StringValueScanner::Initialize() { SetStart(); } result.last_position = iterator.pos.buffer_pos; - result.previous_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, cur_buffer_handle->actual_size}; + result.current_line_position.begin = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, + cur_buffer_handle->actual_size}; - result.pre_previous_line_start = result.previous_line_start; + result.current_line_position.end = result.current_line_position.begin; } void StringValueScanner::ProcessExtraRow() { @@ -1113,8 +1114,8 @@ void StringValueScanner::SetStart() { } result.lines_read++; } - iterator.pos.buffer_idx = scan_finder->result.pre_previous_line_start.buffer_idx; - iterator.pos.buffer_pos = scan_finder->result.pre_previous_line_start.buffer_pos; + iterator.pos.buffer_idx = scan_finder->result.current_line_position.begin.buffer_idx; + iterator.pos.buffer_pos = scan_finder->result.current_line_position.begin.buffer_pos; result.last_position = iterator.pos.buffer_pos; } diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 84ef406b30aa..3c7ba2a28660 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -50,6 +50,18 @@ class LinePosition { idx_t buffer_idx = 0; }; +//! Keeps track of start and end of line positions in regard to the CSV file +class FullLinePosition { +public: + FullLinePosition() {}; + LinePosition begin; + LinePosition end; + + //! Reconstructs the current line to be used in error messages + string ReconstructCurrentLine(bool &first_char_nl, + unordered_map> &buffer_handles); +}; + class StringValueResult : public ScannerResult { public: StringValueResult(CSVStates &states, CSVStateMachine &state_machine, @@ -83,9 +95,10 @@ class StringValueResult : public ScannerResult { //! Information to properly handle errors CSVErrorHandler &error_handler; CSVIterator &iterator; - //! Where the previous line started, used to validate the maximum_line_size option - LinePosition previous_line_start; - LinePosition pre_previous_line_start; + //! 
Line position of the current line + FullLinePosition current_line_position; + //! Used for CSV line reconstruction on flushed errors + unordered_map line_positions_per_row; bool store_line_size = false; bool added_last_line = false; bool quoted_new_line = false; @@ -123,8 +136,6 @@ class StringValueResult : public ScannerResult { //! Handles EmptyLine states static inline bool EmptyLine(StringValueResult &result, const idx_t buffer_pos); inline bool AddRowInternal(); - //! Reconstructs the current line to be used in error messages - string ReconstructCurrentLine(bool &first_char_nl); void HandleOverLimitRows(); From 4403d16353b3cddd7995408a4c049af2a3d743ef Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Fri, 1 Mar 2024 10:36:20 -0300 Subject: [PATCH 019/147] Errors during flush being properly propagated --- .../scanner/string_value_scanner.cpp | 22 +++++++++++++------ .../csv/rejects/csv_rejects_flush_cast.test | 15 +++++++++---- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 516bf437edbb..d08a4a06f009 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -649,10 +649,13 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { } LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - parse_chunk.size() + line_error); - // auto borked_line = result.ReconstructCurrentLine(); - string empty; - auto csv_error = CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], - error_message, col_idx, empty, lines_per_batch, 0); + bool first_nl; + auto borked_line = + result.line_positions_per_row[line_error].ReconstructCurrentLine(first_nl, result.buffer_handles); + auto csv_error = CSVError::CastError( + state_machine->options, csv_file_scan->names[col_idx], error_message, col_idx, borked_line, + lines_per_batch, + result.line_positions_per_row[line_error].begin.GetGlobalPosition(result.result_size, first_nl)); error_handler->Error(csv_error); } borked_lines.insert(line_error++); @@ -668,9 +671,14 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { } LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read - parse_chunk.size() + line_error); - string empty; - auto csv_error = CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], - error_message, col_idx, empty, lines_per_batch, 0); + bool first_nl; + auto borked_line = result.line_positions_per_row[line_error].ReconstructCurrentLine( + first_nl, result.buffer_handles); + auto csv_error = + CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], error_message, + col_idx, borked_line, lines_per_batch, + result.line_positions_per_row[line_error].begin.GetGlobalPosition( + result.result_size, first_nl)); error_handler->Error(csv_error); } diff --git a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test index 05c9af1cb02d..87768b805d8a 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test +++ b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test @@ -21,15 +21,22 @@ DATE VARCHAR 2811 query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +FROM csv_rejects_table order by all; ---- 
-test/sql/copy/csv/data/error/mismatch/bad.csv 2 1 "col1" CAST 4,BBB,9, 9
+data/csv/error/flush_cast.csv 2813 0 "a" CAST c, bla 44971
+data/csv/error/flush_cast.csv 439 0 "a" CAST B, bla 6996
 
 query I
 SELECT error_message
-FROM csv_rejects_table;
+FROM csv_rejects_table where byte_position = 6996;
 ----
-:.*Could not convert string "BBB" to 'INTEGER'.*
+:.*Could not parse string "B" according to format specifier "%d-%m-%Y".*
+
+query I
+SELECT error_message
+FROM csv_rejects_table where byte_position = 44971;
+----
+:.*Could not parse string "c" according to format specifier "%d-%m-%Y".*
 
 statement ok
 DROP TABLE csv_rejects_table;
\ No newline at end of file

From 7129aaf150e0889c975ff29a27270bfc4151567f Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Fri, 1 Mar 2024 12:02:42 -0300
Subject: [PATCH 020/147] Preparing the ground for the other errors

---
 .../scanner/string_value_scanner.cpp          | 29 ++++++++++++++-----
 .../operator/csv_scanner/util/csv_error.cpp   | 19 ++++++------
 .../operator/csv_scanner/csv_error.hpp        | 12 ++++----
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index d08a4a06f009..3852cd338d17 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -307,8 +307,11 @@ void StringValueResult::AddValue(StringValueResult &result, const idx_t buffer_p
 
 void StringValueResult::HandleOverLimitRows() {
 	LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), number_of_rows + 1);
-	auto csv_error = CSVError::IncorrectColumnAmountError(state_machine.options, nullptr, number_of_columns,
-	                                                      cur_col_id + 1, lines_per_batch);
+	bool first_nl;
+	auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
+	auto csv_error =
+	    CSVError::IncorrectColumnAmountError(state_machine.options, cur_col_id + 1, lines_per_batch, borked_line,
+	                                         current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
 	error_handler.Error(csv_error);
 	// If we get here we need to remove the last line
 	cur_col_id = 0;
@@ -372,8 +375,12 @@ bool StringValueResult::AddRowInternal() {
 			error_handler.NewMaxLineSize(current_line_size);
 		}
 		if (current_line_size > state_machine.options.maximum_line_size) {
+			bool first_nl;
+			auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
 			LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), number_of_rows);
-			auto csv_error = CSVError::LineSizeError(state_machine.options, current_line_size, lines_per_batch);
+			auto csv_error =
+			    CSVError::LineSizeError(state_machine.options, current_line_size, lines_per_batch, borked_line,
+			                            current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
 			error_handler.Error(csv_error);
 		}
 		current_line_position.begin = current_line_position.end;
@@ -435,9 +442,12 @@ bool StringValueResult::AddRowInternal() {
 			}
 		} else {
 			// If we are not null-padding, this is an error
+			bool first_nl;
+			auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
 			LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), number_of_rows + 1);
-			auto csv_error = CSVError::IncorrectColumnAmountError(state_machine.options, nullptr, number_of_columns,
-			                                                      cur_col_id, lines_per_batch);
+			auto csv_error = CSVError::IncorrectColumnAmountError(
+			    state_machine.options, cur_col_id, lines_per_batch, borked_line,
+			    
current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
 			error_handler.Error(csv_error);
 			// If we are here, ignore_errors is set, so we delete this line
 			number_of_rows--;
 		}
@@ -481,9 +491,12 @@ bool StringValueResult::AddRow(StringValueResult &result, const idx_t buffer_pos
 void StringValueResult::InvalidState(StringValueResult &result) {
 	// FIXME: How do we recover from an invalid state? Can we restart the state machine and jump to the next row?
 	LinesPerBoundary lines_per_batch(result.iterator.GetBoundaryIdx(), result.number_of_rows);
-	auto csv_error = CSVError::UnterminatedQuotesError(result.state_machine.options,
-	                                                   static_cast<string_t *>(result.vector_ptr[result.chunk_col_id]),
-	                                                   result.number_of_rows, result.cur_col_id, lines_per_batch);
+	bool first_nl;
+	auto borked_line = result.current_line_position.ReconstructCurrentLine(first_nl, result.buffer_handles);
+
+	auto csv_error = CSVError::UnterminatedQuotesError(
+	    result.state_machine.options, result.cur_col_id, lines_per_batch, borked_line,
+	    result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl));
 	result.error_handler.Error(csv_error);
 }
 
diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp
index b77437c88f6b..3174ff2d7723 100644
--- a/src/execution/operator/csv_scanner/util/csv_error.cpp
+++ b/src/execution/operator/csv_scanner/util/csv_error.cpp
@@ -115,12 +115,13 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_nam
 	return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info, byte_position);
 }
 
-CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info) {
+CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info,
+                                 string &csv_row, idx_t byte_position) {
 	std::ostringstream error;
 	error << "Maximum line size of " << options.maximum_line_size << " bytes exceeded. ";
 	error << "Actual Size:" << actual_size << " bytes." << std::endl;
 	error << options.ToString();
-	return CSVError(error.str(), CSVErrorType::MAXIMUM_LINE_SIZE, error_info);
+	return CSVError(error.str(), CSVErrorType::MAXIMUM_LINE_SIZE, 0, csv_row, error_info, byte_position);
 }
 
 CSVError CSVError::SniffingError(string &file_path) {
@@ -141,26 +142,26 @@ CSVError CSVError::NullPaddingFail(const CSVReaderOptions &options, LinesPerBoun
 	return CSVError(error.str(), CSVErrorType::NULLPADDED_QUOTED_NEW_VALUE, error_info);
 }
 
-CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, string_t *vector_ptr,
-                                           idx_t vector_line_start, idx_t current_column, LinesPerBoundary error_info) {
+CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, idx_t current_column,
+                                           LinesPerBoundary error_info, string &csv_row, idx_t byte_position) {
 	std::ostringstream error;
 	error << "Value with unterminated quote found."
<< std::endl;
 	error << std::endl;
 	// What were the options
 	error << options.ToString();
-	return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, error_info);
+	return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info, byte_position);
 }
 
-CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, string_t *vector_ptr,
-                                              idx_t vector_line_start, idx_t actual_columns,
-                                              LinesPerBoundary error_info) {
+CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, idx_t actual_columns,
+                                              LinesPerBoundary error_info, string &csv_row, idx_t byte_position) {
 	std::ostringstream error;
 	// How many columns were expected and how many were found
 	error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns
 	      << std::endl;
 	// What were the options
 	error << options.ToString();
-	return CSVError(error.str(), CSVErrorType::INCORRECT_COLUMN_AMOUNT, error_info);
+	return CSVError(error.str(), CSVErrorType::INCORRECT_COLUMN_AMOUNT, actual_columns, csv_row, error_info,
+	                byte_position);
 }
 
 bool CSVErrorHandler::PrintLineNumber(CSVError &error) {
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
index 44bd4f25913a..d8f755600e95 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
@@ -59,18 +59,18 @@ class CSVError {
 	static CSVError CastError(const CSVReaderOptions &options, string &column_name, string &cast_error,
 	                          idx_t column_idx, string &csv_row, LinesPerBoundary error_info, idx_t byte_position);
 	//! Produces error for when the line size exceeds the maximum line size option
-	static CSVError LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info);
+	static CSVError LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info,
+	                              string &csv_row, idx_t byte_position);
 	//! Produces error for when the sniffer couldn't find viable options
 	static CSVError SniffingError(string &file_path);
 	//! Produces error messages for unterminated quoted values
-	static CSVError UnterminatedQuotesError(const CSVReaderOptions &options, string_t *vector_ptr,
-	                                        idx_t vector_line_start, idx_t current_column, LinesPerBoundary error_info);
+	static CSVError UnterminatedQuotesError(const CSVReaderOptions &options, idx_t current_column,
+	                                        LinesPerBoundary error_info, string &csv_row, idx_t byte_position);
 	//! Produces error messages for when the null_padding option is set and we have quoted new values in parallel
 	static CSVError NullPaddingFail(const CSVReaderOptions &options, LinesPerBoundary error_info);
 	//! 
Produces error for incorrect (e.g., higher or lower than the predefined) number of columns in a CSV line
-	static CSVError IncorrectColumnAmountError(const CSVReaderOptions &options, string_t *vector_ptr,
-	                                           idx_t vector_line_start, idx_t actual_columns,
-	                                           LinesPerBoundary error_info);
+	static CSVError IncorrectColumnAmountError(const CSVReaderOptions &options, idx_t actual_columns,
+	                                           LinesPerBoundary error_info, string &csv_row, idx_t byte_position);
 	idx_t GetBoundaryIndex() {
 		return error_info.boundary_idx;
 	}
 
From 7cecc4b983a9e1830abb8a77aee5bf7c97f5c11c Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 6 Mar 2024 11:02:10 +0100
Subject: [PATCH 021/147] All rejects tests pass with vector_size=2

---
 .../duckdb/execution/operator/csv_scanner/csv_error.hpp   | 2 +-
 test/sql/copy/csv/rejects/csv_buffer_size_rejects.test    | 3 ---
 .../csv/rejects/csv_incorrect_columns_amount_rejects.test | 0
 test/sql/copy/csv/rejects/csv_rejects_auto.test           | 3 ---
 test/sql/copy/csv/rejects/csv_rejects_flush_cast.test     | 5 -----
 test/sql/copy/csv/rejects/csv_rejects_read.test           | 4 ----
 test/sql/copy/csv/rejects/test_invalid_parameters.test    | 3 ---
 7 files changed, 1 insertion(+), 19 deletions(-)
 create mode 100644 test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test

diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
index d8f755600e95..9331d62c34f5 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
@@ -43,7 +43,7 @@ enum CSVErrorType : uint8_t {
 	UNTERMINATED_QUOTES = 3, // If a quote is not terminated
 	SNIFFING = 4, // If something went wrong during sniffing and it was not possible to find suitable candidates
 	MAXIMUM_LINE_SIZE = 5, // Maximum line size was exceeded by a line in the CSV file
-	NULLPADDED_QUOTED_NEW_VALUE = 6, // If the null_padding option is set and we have quoted new values in parallel
+	NULLPADDED_QUOTED_NEW_VALUE = 6, // If the null_padding option is set, and we have quoted new values in parallel
 };
 
diff --git a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
index 15461ee1c9f3..dcef91e814ee 100644
--- a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
+++ b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
@@ -4,9 +4,6 @@
 
 require skip_reload
 
-# FIXME: https://github.com/duckdb/duckdb/issues/7755
-require vector_size 2048
-
 loop buffer_size 5 10
 
 # Ensure that we can get the schema if we reduce the sample size and ignore errors
diff --git a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/sql/copy/csv/rejects/csv_rejects_auto.test b/test/sql/copy/csv/rejects/csv_rejects_auto.test
index 887bd282db73..841ed42465f3 100644
--- a/test/sql/copy/csv/rejects/csv_rejects_auto.test
+++ b/test/sql/copy/csv/rejects/csv_rejects_auto.test
@@ -3,9 +3,6 @@
 
 require skip_reload
 
-# FIXME: https://github.com/duckdb/duckdb/issues/7755
-require vector_size 2048
-
 # Ensure that we can get the schema if we reduce the sample size and ignore errors
 query IIIII
 SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
diff --git a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test 
b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test index 87768b805d8a..6b2f5e59d7f5 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test +++ b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test @@ -2,11 +2,6 @@ # description: Test that Flush Cast functions properly for the rejects tables # group: [rejects] -require skip_reload - -# FIXME: https://github.com/duckdb/duckdb/issues/7755 -require vector_size 2048 - query III SELECT typeof(first(a)), typeof(first(b)), COUNT(*) FROM read_csv( 'data/csv/error/flush_cast.csv', diff --git a/test/sql/copy/csv/rejects/csv_rejects_read.test b/test/sql/copy/csv/rejects/csv_rejects_read.test index ab5b12db949a..a0b2e751289d 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_read.test +++ b/test/sql/copy/csv/rejects/csv_rejects_read.test @@ -3,10 +3,6 @@ require skip_reload -# FIXME: https://github.com/duckdb/duckdb/issues/7755 -require vector_size 2048 - - query III rowsort SELECT * FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/bad.csv', diff --git a/test/sql/copy/csv/rejects/test_invalid_parameters.test b/test/sql/copy/csv/rejects/test_invalid_parameters.test index 5209960fef88..2e343a30765d 100644 --- a/test/sql/copy/csv/rejects/test_invalid_parameters.test +++ b/test/sql/copy/csv/rejects/test_invalid_parameters.test @@ -3,9 +3,6 @@ require skip_reload -# FIXME: https://github.com/duckdb/duckdb/issues/7755 -require vector_size 2048 - # Test invalid arguments statement error SELECT * FROM read_csv( From 8222dd4f0888703e1ad2883671087e5e0976d239 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 6 Mar 2024 13:32:28 +0100 Subject: [PATCH 022/147] WIP on column amount incorrect --- .../rejects/incorrect_columns/few_columns.csv | 2773 +++++++++++++++++ .../incorrect_columns/many_columns.csv | 2773 +++++++++++++++++ .../rejects/incorrect_columns/mix_columns.csv | 2773 +++++++++++++++++ .../scanner/string_value_scanner.cpp | 4 +- .../table_function/global_csv_state.cpp | 36 +- .../operator/csv_scanner/util/csv_error.cpp | 12 +- .../operator/csv_scanner/csv_error.hpp | 13 +- .../csv_incorrect_columns_amount_rejects.test | 22 + 8 files changed, 8392 insertions(+), 14 deletions(-) create mode 100644 data/csv/rejects/incorrect_columns/few_columns.csv create mode 100644 data/csv/rejects/incorrect_columns/many_columns.csv create mode 100644 data/csv/rejects/incorrect_columns/mix_columns.csv diff --git a/data/csv/rejects/incorrect_columns/few_columns.csv b/data/csv/rejects/incorrect_columns/few_columns.csv new file mode 100644 index 000000000000..9b9d66a642d3 --- /dev/null +++ b/data/csv/rejects/incorrect_columns/few_columns.csv @@ -0,0 +1,2773 @@ +a,b,c,d +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 diff --git a/data/csv/rejects/incorrect_columns/many_columns.csv b/data/csv/rejects/incorrect_columns/many_columns.csv new file mode 100644 index 000000000000..e8611f730f05 --- /dev/null +++ b/data/csv/rejects/incorrect_columns/many_columns.csv @@ -0,0 +1,2773 @@ +a,b,c,d +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4,5,6 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4,5,6 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4,5 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 +1,2,3,4 
+1,2,3,4
[... several hundred further "1,2,3,4" rows of many_columns.csv elided; the file is well-formed except for a handful of over-long rows ("1,2,3,4,5", "1,2,3,4,5,6") at the line numbers the tests below expect ...]
+1,2,3,4,5
+1,2,3,4
+1,2,3,4
+1,2,3,4
+1,2,3,4
diff --git a/data/csv/rejects/incorrect_columns/mix_columns.csv b/data/csv/rejects/incorrect_columns/mix_columns.csv
new file mode 100644
index 000000000000..1217d34524cd
--- /dev/null
+++ b/data/csv/rejects/incorrect_columns/mix_columns.csv
@@ -0,0 +1,2773 @@
+a,b,c,d
+1,2,3,4
[... the remaining 2,771 rows of mix_columns.csv elided; almost all are "1,2,3,4", mixed with a few malformed rows ("1", "1,2", "1,2,3,4,5,6") that the rejects tests below pick up ...]
diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 1d8c1ef2864e..a1ef8c54448d 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -308,7 +308,7 @@ void StringValueResult::AddValue(StringValueResult &result, const idx_t buffer_p
 }
 
 void StringValueResult::HandleOverLimitRows() {
-	LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), number_of_rows + 1);
+	LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read + 1);
 	bool first_nl;
 	auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
 	auto csv_error =
@@ -446,7 +446,7 @@ bool StringValueResult::AddRowInternal() {
 			// If we are not null-padding this is an error
 			bool first_nl;
 			auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
-			LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), number_of_rows + 1);
+			LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read + 1);
 			auto csv_error = CSVError::IncorrectColumnAmountError(
 			    state_machine.options, cur_col_id, lines_per_batch, borked_line,
 			    current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
index 77982d83fab2..73a3a2fe420c 100644
--- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
+++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
@@ -137,6 +137,36 @@ void CSVGlobalState::DecrementThread() {
 	}
 }
 
+bool IsCSVErrorAcceptedReject(CSVErrorType type) {
+	switch (type) {
+	case CSVErrorType::CAST_ERROR:
+	case CSVErrorType::TOO_MANY_COLUMNS:
+	case CSVErrorType::TOO_FEW_COLUMNS:
+	case CSVErrorType::MAXIMUM_LINE_SIZE:
+	case CSVErrorType::UNTERMINATED_QUOTES:
+		return true;
+	default:
+		return false;
+	}
+}
+
+string CSVErrorTypeToEnum(CSVErrorType type) {
+	switch (type) {
+	case CSVErrorType::CAST_ERROR:
+		return "CAST";
+	case CSVErrorType::TOO_FEW_COLUMNS:
+		return "MISSING COLUMNS";
+	case CSVErrorType::TOO_MANY_COLUMNS:
+		return "TOO MANY COLUMNS";
+	case CSVErrorType::MAXIMUM_LINE_SIZE:
+		return "LINE SIZE OVER MAXIMUM";
+	case CSVErrorType::UNTERMINATED_QUOTES:
+		return "UNQUOTED VALUE";
+	default:
+		throw InternalException("CSV Error is not valid to be stored in a Rejects Table");
+	}
+}
+
 void CSVGlobalState::FillRejectsTable() {
 	auto &options = bind_data.options;
 
@@ -152,7 +182,7 @@
 		auto &errors = file->error_handler->errors;
 		for (auto &error_vector : errors) {
 			for (auto &error : error_vector.second) {
-				if (error.type != CSVErrorType::CAST_ERROR) {
+				if (!IsCSVErrorAcceptedReject(error.type)) {
 					// For now, we only will use it for casting errors
 					continue;
 				}
@@ -177,8 +207,8 @@
 				appender.Append(col_idx);
 				// 5. Column Name (If Applicable)
 				appender.Append(string_t("\"" + col_name + "\""));
-				// 6. Error Type (ENUM?)
-				appender.Append(string_t("CAST"));
+				// 6. Error Type
+				appender.Append(string_t(CSVErrorTypeToEnum(error.type)));
+				// 7. Original CSV Line
+				appender.Append(string_t(error.csv_row));
+				// 8. Full Error Message
diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp
index 3174ff2d7723..af7d45864812 100644
--- a/src/execution/operator/csv_scanner/util/csv_error.cpp
+++ b/src/execution/operator/csv_scanner/util/csv_error.cpp
@@ -1,5 +1,6 @@
 #include "duckdb/execution/operator/csv_scanner/csv_error.hpp"
 #include "duckdb/common/exception/conversion_exception.hpp"
+
 #include <sstream>
 
 namespace duckdb {
@@ -160,8 +161,12 @@ CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, i
 	      << std::endl;
 	// What were the options
 	error << options.ToString();
-	return CSVError(error.str(), CSVErrorType::INCORRECT_COLUMN_AMOUNT, actual_columns, csv_row, error_info,
-	                byte_position);
+	if (actual_columns > options.dialect_options.num_cols) {
+		return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info,
+		                byte_position);
+	} else {
+		return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, byte_position);
+	}
 }
 
 bool CSVErrorHandler::PrintLineNumber(CSVError &error) {
@@ -171,7 +176,8 @@
 	switch (error.type) {
 	case CSVErrorType::CAST_ERROR:
 	case CSVErrorType::UNTERMINATED_QUOTES:
-	case CSVErrorType::INCORRECT_COLUMN_AMOUNT:
+	case CSVErrorType::TOO_FEW_COLUMNS:
+	case CSVErrorType::TOO_MANY_COLUMNS:
 	case CSVErrorType::MAXIMUM_LINE_SIZE:
 	case CSVErrorType::NULLPADDED_QUOTED_NEW_VALUE:
 		return true;
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
index 9331d62c34f5..95436017f2a1 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
@@ -39,11 +39,12 @@ class LinesPerBoundary {
 enum CSVErrorType : uint8_t {
 	CAST_ERROR = 0,                  // If when casting a value from string to the column type fails
 	COLUMN_NAME_TYPE_MISMATCH = 1,   // If there is a mismatch between Column Names and Types
-	INCORRECT_COLUMN_AMOUNT = 2,     // If the CSV is missing a column
-	UNTERMINATED_QUOTES = 3,         // If a quote is not terminated
-	SNIFFING = 4,        // If something went wrong during sniffing and was not possible to find suitable candidates
-	MAXIMUM_LINE_SIZE = 5,           // Maximum line size was exceeded by a line in the CSV File
-	NULLPADDED_QUOTED_NEW_VALUE = 6, // If the null_padding option is set, and we have quoted new values in parallel
+	TOO_FEW_COLUMNS = 2,             // If the CSV has too few columns
+	TOO_MANY_COLUMNS = 3,            // If the CSV has too many columns
+	UNTERMINATED_QUOTES = 4,         // If a quote is not terminated
+	SNIFFING = 5,        // If something went wrong during sniffing and was not possible to find suitable candidates
+	MAXIMUM_LINE_SIZE = 6,           // Maximum line size was exceeded by a line in the CSV File
+	NULLPADDED_QUOTED_NEW_VALUE = 7, // If the null_padding option is set, and we have quoted new values in parallel
 };
 
@@ -69,7 +70,7 @@ class CSVError {
 	//! Produces error messages for null_padding option is set and we have quoted new values in parallel
 	static CSVError NullPaddingFail(const CSVReaderOptions &options, LinesPerBoundary error_info);
 	//! Produces error for an incorrect (e.g., smaller or larger than the predefined) number of columns in a CSV Line
-	static CSVError IncorrectColumnAmountError(const CSVReaderOptions &options, idx_t actual_columns,
+	static CSVError IncorrectColumnAmountError(const CSVReaderOptions &state_machine, idx_t actual_columns,
 	                                           LinesPerBoundary error_info, string &csv_row, idx_t byte_position);
 
 	idx_t GetBoundaryIndex() {
 		return error_info.boundary_idx;
diff --git a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
index e69de29bb2d1..7f899e860de0 100644
--- a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
+++ b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
@@ -0,0 +1,22 @@
+# name: test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
+# description: Test that incorrect column amounts return correct info as rejects tables
+# group: [rejects]
+
+require skip_reload
+
+
+statement ok
+SELECT * FROM read_csv(
+    'data/csv/rejects/incorrect_columns/few_columns.csv',
+    columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'},
+    rejects_table='csv_rejects_table',
+    ignore_errors=true, auto_detect=false, header = 1);
+
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
+FROM csv_rejects_table;
+----
+data/csv/rejects/incorrect_columns/few_columns.csv 1816 3 "d" MISSING COLUMNS 1,2,3 14504
+data/csv/rejects/incorrect_columns/few_columns.csv 1825 1 "b" MISSING COLUMNS 1 14574
+data/csv/rejects/incorrect_columns/few_columns.csv 2380 1 "b" MISSING COLUMNS 1 19008
+data/csv/rejects/incorrect_columns/few_columns.csv 2764 2 "c" MISSING COLUMNS 1,2 22074
\ No newline at end of file
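Taken together, the patch above turns the rejects table from a cast-error log into a general malformed-line log: IsCSVErrorAcceptedReject gates which error types get recorded, and CSVErrorTypeToEnum renders them as the strings the test checks for. A minimal usage sketch follows; the input file name is hypothetical, while the table name and columns are the ones the test above queries:

    SELECT * FROM read_csv(
        'flaky_input.csv',
        columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'},
        rejects_table = 'csv_rejects_table',
        ignore_errors = true);

    -- only the lines that were dropped for having too few values
    SELECT line, column_name, csv_line, byte_position
    FROM csv_rejects_table
    WHERE error_type = 'MISSING COLUMNS';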
From d9ffcb922d214dd366f046517e81027dfacfd8c3 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 6 Mar 2024 13:42:03 +0100
Subject: [PATCH 023/147] remaining merge

---
 src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
index 1427b1e840c4..82160a8490d8 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
@@ -74,7 +74,6 @@ class CSVError {
 	                                           LinesPerBoundary error_info, string &csv_row, idx_t byte_position);
 
 	static CSVError InvalidUTF8(const CSVReaderOptions &options, LinesPerBoundary error_info);
-
 	idx_t GetBoundaryIndex() {
 		return error_info.boundary_idx;
 	}
From d54bf5b7c590803ea5707881325b9fc205ae4694 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 6 Mar 2024 13:47:22 +0100
Subject: [PATCH 024/147] line error fix

---
 .../operator/csv_scanner/scanner/string_value_scanner.cpp | 4 ++--
 .../csv/rejects/csv_incorrect_columns_amount_rejects.test | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 6a7586ee5b4e..4d50b2ec7a8e 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -325,7 +325,7 @@ void StringValueResult::AddValue(StringValueResult &result, const idx_t buffer_p
 }
 
 void StringValueResult::HandleOverLimitRows() {
-	LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read + 1);
+	LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
 	bool first_nl;
 	auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
 	auto csv_error =
@@ -465,7 +465,7 @@ bool StringValueResult::AddRowInternal() {
 			// If we are not null-padding this is an error
 			bool first_nl;
 			auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
-			LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read + 1);
+			LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
 			auto csv_error = CSVError::IncorrectColumnAmountError(
 			    state_machine.options, cur_col_id, lines_per_batch, borked_line,
 			    current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
diff --git a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
index 7f899e860de0..b7d50fca8b7a 100644
--- a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
+++ b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
@@ -16,7 +16,7 @@ query IIIIIII rowsort
 SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
 FROM csv_rejects_table;
 ----
-data/csv/rejects/incorrect_columns/few_columns.csv 1816 3 "d" MISSING COLUMNS 1,2,3 14504
-data/csv/rejects/incorrect_columns/few_columns.csv 1825 1 "b" MISSING COLUMNS 1 14574
-data/csv/rejects/incorrect_columns/few_columns.csv 2380 1 "b" MISSING COLUMNS 1 19008
-data/csv/rejects/incorrect_columns/few_columns.csv 2764 2 "c" MISSING COLUMNS 1,2 22074
+data/csv/rejects/incorrect_columns/few_columns.csv 1814 3 "d" MISSING COLUMNS 1,2,3 14504
+data/csv/rejects/incorrect_columns/few_columns.csv 1823 1 "b" MISSING COLUMNS 1 14574
+data/csv/rejects/incorrect_columns/few_columns.csv 2378 1 "b" MISSING COLUMNS 1 19008
+data/csv/rejects/incorrect_columns/few_columns.csv 2762 2 "c" MISSING COLUMNS 1,2 22074
\ No newline at end of file
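The dropped "+ 1" above appears to have been double-counting: lines_read already covers the current line by the time the error is materialized, so the recorded numbers drifted past the real physical line, and the expected values in the test shift down to match. The intended behavior, sketched on a hypothetical three-line file bad.csv (the file and its contents are illustrative only; the reported line is 1-based and counts the header, as the small_mix.csv expectations later in the series show):

    -- bad.csv          -- expected reject after this patch
    -- a,b       (1)
    -- 1,2       (2)
    -- 1         (3)    -- line = 3, error_type = 'MISSING COLUMNS', csv_line = '1'

    SELECT * FROM read_csv('bad.csv',
        columns = {'a': 'INTEGER', 'b': 'INTEGER'},
        rejects_table = 'csv_rejects_table',
        ignore_errors = true, auto_detect = false, header = 1);

    SELECT line, error_type, csv_line FROM csv_rejects_table;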
From 67703ebf6062b66969d8d8794f8622358c2f3cd0 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 6 Mar 2024 14:04:45 +0100
Subject: [PATCH 025/147] Get information on too many columns right

---
 .../scanner/string_value_scanner.cpp          | 16 ++++++++------
 .../table_function/global_csv_state.cpp       | 14 +++++++++---
 .../csv_incorrect_columns_amount_rejects.test | 22 +++++++++++++++++--
 3 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 4d50b2ec7a8e..caf511b4a7c4 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -111,6 +111,10 @@ inline bool IsValueNull(const char *null_str_ptr, const char *value_ptr, const i
 }
 
 void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size, bool allocate) {
+	if (ignore_current_row) {
+		cur_col_id++;
+		return;
+	}
 	if (cur_col_id >= number_of_columns) {
 		bool error = true;
 		if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
 			error = !IsValueNull(null_str_ptr, value_ptr, size);
 		}
 		if (error) {
-			HandleOverLimitRows();
+			ignore_current_row = true;
 		}
-	}
-	if (ignore_current_row) {
 		return;
 	}
+
 	if (projecting_columns) {
 		if (!projected_columns[cur_col_id]) {
 			cur_col_id++;
 			return;
 		}
@@ -332,10 +335,6 @@ void StringValueResult::HandleOverLimitRows() {
 	    CSVError::IncorrectColumnAmountError(state_machine.options, cur_col_id + 1, lines_per_batch, borked_line,
 	                                         current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
 	error_handler.Error(csv_error);
-	// If we get here we need to remove the last line
-	cur_col_id = 0;
-	chunk_col_id = 0;
-	ignore_current_row = true;
 }
 
 void StringValueResult::QuotedNewLine(StringValueResult &result) {
 	result.quoted_new_line = true;
 }
@@ -405,6 +404,9 @@ bool StringValueResult::AddRowInternal() {
 	current_line_position.begin = current_line_position.end;
 	current_line_position.end = current_line_start;
 	if (ignore_current_row) {
+		if (cur_col_id >= number_of_columns) {
+			HandleOverLimitRows();
+		}
 		cur_col_id = 0;
 		chunk_col_id = 0;
 		// An error occurred on this row, we are ignoring it and resetting our control flag
diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
index 8e5bc26f411c..e412e21f8c4c 100644
--- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
+++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
@@ -194,7 +194,11 @@
 				rejects->count++;
 				auto row_line = file->error_handler->GetLine(error.error_info);
 				auto col_idx = error.column_idx;
-				auto col_name = bind_data.return_names[col_idx];
+				string col_name;
+				if (error.type != CSVErrorType::TOO_MANY_COLUMNS) {
+					// Too many columns does not have a name; all other errors do
+					col_name = bind_data.return_names[col_idx];
+				}
 				// Add the row to the rejects table
 				appender.BeginRow();
 				// 1. File Path
 				appender.Append(string_t(file->file_path));
 				// 2. Row Line
 				appender.Append(row_line);
 				// 3. Byte Position where error occurred
 				appender.Append(error.byte_position);
-				// 4. Column Index (If Applicable)
+				// 4. Column Index
 				appender.Append(col_idx);
 				// 5. Column Name (If Applicable)
-				appender.Append(string_t("\"" + col_name + "\""));
+				if (col_name.empty()) {
+					appender.Append(Value());
+				} else {
+					appender.Append(string_t("\"" + col_name + "\""));
+				}
 				// 6. Error Type
 				appender.Append(string_t(CSVErrorTypeToEnum(error.type)));
 				// 7. Original CSV Line
diff --git a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
index b7d50fca8b7a..f0ea6c9eda57 100644
--- a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
+++ b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
@@ -4,7 +4,6 @@
 
 require skip_reload
 
-
 statement ok
 SELECT * FROM read_csv(
     'data/csv/rejects/incorrect_columns/few_columns.csv',
     columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'},
     rejects_table='csv_rejects_table',
     ignore_errors=true, auto_detect=false, header = 1);
@@ -19,4 +18,23 @@ FROM csv_rejects_table;
 ----
 data/csv/rejects/incorrect_columns/few_columns.csv 1814 3 "d" MISSING COLUMNS 1,2,3 14504
 data/csv/rejects/incorrect_columns/few_columns.csv 1823 1 "b" MISSING COLUMNS 1 14574
 data/csv/rejects/incorrect_columns/few_columns.csv 2378 1 "b" MISSING COLUMNS 1 19008
-data/csv/rejects/incorrect_columns/few_columns.csv 2762 2 "c" MISSING COLUMNS 1,2 22074
\ No newline at end of file
+data/csv/rejects/incorrect_columns/few_columns.csv 2762 2 "c" MISSING COLUMNS 1,2 22074
+
+statement ok
+DROP TABLE csv_rejects_table;
+
+statement ok
+SELECT * FROM read_csv(
+    'data/csv/rejects/incorrect_columns/many_columns.csv',
+    columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'},
+    rejects_table='csv_rejects_table',
+    ignore_errors=true, auto_detect=false, header = 1);
+
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
+FROM csv_rejects_table;
+----
+data/csv/rejects/incorrect_columns/many_columns.csv 1096 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 8760
+data/csv/rejects/incorrect_columns/many_columns.csv 1159 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 9268
+data/csv/rejects/incorrect_columns/many_columns.csv 1206 5 NULL TOO MANY COLUMNS 1,2,3,4,5 9648
+data/csv/rejects/incorrect_columns/many_columns.csv 2769 5 NULL TOO MANY COLUMNS 1,2,3,4,5 22154
\ No newline at end of file
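One consequence worth noting: a TOO MANY COLUMNS reject has no declared column to blame, so column_name is now NULL for those rows, while every other error type still records the quoted name of the column it failed on. That distinction is directly queryable; a small sketch over the rejects table the tests use:

    SELECT error_type,
           count(*)           AS rejected_rows,
           count(column_name) AS rows_with_named_column  -- count(expr) skips NULLs
    FROM csv_rejects_table
    GROUP BY error_type;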
From 12542a2cdbeed1bfbd868802bcafa6979db521f1 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 6 Mar 2024 14:10:52 +0100
Subject: [PATCH 026/147] More tests for different incorrect column amounts

---
 .../rejects/incorrect_columns/small_mix.csv   |  5 ++
 .../csv_incorrect_columns_amount_rejects.test | 73 ++++++++++++++++++-
 2 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 data/csv/rejects/incorrect_columns/small_mix.csv

diff --git a/data/csv/rejects/incorrect_columns/small_mix.csv b/data/csv/rejects/incorrect_columns/small_mix.csv
new file mode 100644
index 000000000000..1cfae5653bcd
--- /dev/null
+++ b/data/csv/rejects/incorrect_columns/small_mix.csv
@@ -0,0 +1,5 @@
+a,b,c,d
+1,2,3,4
+1,2,3,4,5
+1,2,3
+1,2,3,4
\ No newline at end of file
diff --git a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
index f0ea6c9eda57..414e9fdd8594 100644
--- a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
+++ b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
@@ -37,4 +37,75 @@ FROM csv_rejects_table;
 ----
 data/csv/rejects/incorrect_columns/many_columns.csv 1096 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 8760
 data/csv/rejects/incorrect_columns/many_columns.csv 1159 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 9268
 data/csv/rejects/incorrect_columns/many_columns.csv 1206 5 NULL TOO MANY COLUMNS 1,2,3,4,5 9648
-data/csv/rejects/incorrect_columns/many_columns.csv 2769 5 NULL TOO MANY COLUMNS 1,2,3,4,5 22154
+data/csv/rejects/incorrect_columns/many_columns.csv 2769 5 NULL TOO MANY COLUMNS 1,2,3,4,5 22154
+
+statement ok
+DROP TABLE csv_rejects_table;
+
+statement ok
+SELECT * FROM read_csv(
+    'data/csv/rejects/incorrect_columns/mix_columns.csv',
+    columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'},
+    rejects_table='csv_rejects_table',
+    ignore_errors=true, auto_detect=false, header = 1);
+
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
+FROM csv_rejects_table;
+----
+data/csv/rejects/incorrect_columns/mix_columns.csv 1604 1 "b" MISSING COLUMNS 1 12824
+data/csv/rejects/incorrect_columns/mix_columns.csv 1671 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 13354
+data/csv/rejects/incorrect_columns/mix_columns.csv 2751 2 "c" MISSING COLUMNS 1,2 21998
+data/csv/rejects/incorrect_columns/mix_columns.csv 2768 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 22130
+
+# Different Buffer Sizes
+loop buffer_size 10 15
+
+statement ok
+DROP TABLE csv_rejects_table;
+
+statement ok
+SELECT * FROM read_csv(
+    'data/csv/rejects/incorrect_columns/small_mix.csv',
+    columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'},
+    rejects_table='csv_rejects_table',
+    ignore_errors=true, auto_detect=false, header = 1);
+
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
+FROM csv_rejects_table;
+----
+data/csv/rejects/incorrect_columns/small_mix.csv 3 5 NULL TOO MANY COLUMNS 1,2,3,4,5 16
+data/csv/rejects/incorrect_columns/small_mix.csv 4 3 "d" MISSING COLUMNS 1,2,3 26
+
+endloop
+
+# All files
+statement ok
+DROP TABLE csv_rejects_table;
+
+statement ok
+SELECT * FROM read_csv(
+    'data/csv/rejects/incorrect_columns/*.csv',
+    columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'},
+    rejects_table='csv_rejects_table',
+    ignore_errors=true, auto_detect=false, header = 1);
+
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
+FROM csv_rejects_table;
+----
+data/csv/rejects/incorrect_columns/few_columns.csv 1814 3 "d" MISSING COLUMNS 1,2,3 14504
+data/csv/rejects/incorrect_columns/few_columns.csv 1823 1 "b" MISSING COLUMNS 1 14574
+data/csv/rejects/incorrect_columns/few_columns.csv 2378 1 "b" MISSING COLUMNS 1 19008
+data/csv/rejects/incorrect_columns/few_columns.csv 2762 2 "c" MISSING COLUMNS 1,2 22074
+data/csv/rejects/incorrect_columns/many_columns.csv 1096 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 8760
+data/csv/rejects/incorrect_columns/many_columns.csv 1159 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 9268
+data/csv/rejects/incorrect_columns/many_columns.csv 1206 5 NULL TOO MANY COLUMNS 1,2,3,4,5 9648
+data/csv/rejects/incorrect_columns/many_columns.csv 2769 5 NULL TOO MANY COLUMNS 1,2,3,4,5 22154
+data/csv/rejects/incorrect_columns/mix_columns.csv 1604 1 "b" MISSING COLUMNS 1 12824
+data/csv/rejects/incorrect_columns/mix_columns.csv 1671 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 13354
+data/csv/rejects/incorrect_columns/mix_columns.csv 2751 2 "c" MISSING COLUMNS 1,2 21998
+data/csv/rejects/incorrect_columns/mix_columns.csv 2768 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 22130
+data/csv/rejects/incorrect_columns/small_mix.csv 3 5 NULL TOO MANY COLUMNS 1,2,3,4,5 16
+data/csv/rejects/incorrect_columns/small_mix.csv 4 3 "d" MISSING COLUMNS 1,2,3 26
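The buffer-size loop above is presumably there to exercise boundary handling, since with a five-line file a tiny scan buffer forces the malformed rows to straddle buffer boundaries; note the loop variable is not referenced inside the block, so each iteration currently runs an identical scan. A single iteration can be reproduced by hand. This sketch assumes buffer_size is the regular read_csv option of that name, and the value is only an illustration (it may be subject to a minimum):

    SELECT * FROM read_csv(
        'data/csv/rejects/incorrect_columns/small_mix.csv',
        columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'},
        rejects_table = 'csv_rejects_table',
        ignore_errors = true, auto_detect = false, header = 1,
        buffer_size = 10);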
From 8ea5f0a7d0255a3317a2e2152c29f0abaf42031b Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 6 Mar 2024 16:56:57 +0100
Subject: [PATCH 027/147] WIP on sanitizing invalid utfs and more on utf
 rejects tables

---
 .../scanner/string_value_scanner.cpp          | 45 ++++++++++++++-----
 .../table_function/global_csv_state.cpp       |  7 ++-
 .../operator/csv_scanner/util/csv_error.cpp   |  5 ++-
 .../operator/persistent/csv_rejects_table.cpp |  3 +-
 .../operator/csv_scanner/csv_error.hpp        |  3 +-
 .../csv_scanner/string_value_scanner.hpp      | 16 ++++++-
 .../csv/rejects/test_invalid_utf_rejects.test | 15 +++++++
 .../utf8proc/include/utf8proc_wrapper.hpp     |  2 +
 third_party/utf8proc/utf8proc_wrapper.cpp     | 33 ++++++++++++++
 9 files changed, 109 insertions(+), 20 deletions(-)
 create mode 100644 test/sql/copy/csv/rejects/test_invalid_utf_rejects.test

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index caf511b4a7c4..a87f43917e1b 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -111,7 +111,7 @@ inline bool IsValueNull(const char *null_str_ptr, const char *value_ptr, const i
 }
 
 void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size, bool allocate) {
-	if (ignore_current_row) {
+	if (current_error.is_set) {
 		cur_col_id++;
 		return;
 	}
 	if (cur_col_id >= number_of_columns) {
 		bool error = true;
 		if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
 			error = !IsValueNull(null_str_ptr, value_ptr, size);
 		}
 		if (error) {
-			ignore_current_row = true;
+			current_error = {CSVErrorType::TOO_MANY_COLUMNS};
 		}
 		return;
 	}
 
 	if (projecting_columns) {
 		if (!projected_columns[cur_col_id]) {
 			cur_col_id++;
 			return;
 		}
@@ -221,11 +221,11 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 	if (parse_types[chunk_col_id].second && !Utf8Proc::IsValid(value_ptr, UnsafeNumericCast<idx_t>(size))) {
 		bool force_error = !state_machine.options.ignore_errors && sniffing;
 		// Invalid unicode, we must error
-		LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
-		auto csv_error = CSVError::InvalidUTF8(state_machine.options, lines_per_batch);
-		error_handler.Error(csv_error, force_error);
+		if (force_error) {
+			HandleUnicodeError(force_error);
+		}
 		// If we got here, we are ignoring errors, hence we must ignore this line.
-		ignore_current_row = true;
+		current_error = {CSVErrorType::INVALID_UNICODE};
 		break;
 	}
 	if (allocate) {
@@ -282,7 +282,7 @@ void StringValueResult::Reset() {
 	if (cur_buffer) {
 		buffer_handles[cur_buffer->buffer_idx] = cur_buffer;
 	}
-	ignore_current_row = false;
+	current_error.Reset();
 }
 
 void StringValueResult::AddQuotedValue(StringValueResult &result, const idx_t buffer_pos) {
@@ -337,6 +337,20 @@ void StringValueResult::HandleOverLimitRows() {
 	    CSVError::IncorrectColumnAmountError(state_machine.options, cur_col_id + 1, lines_per_batch, borked_line,
 	                                         current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
 	error_handler.Error(csv_error);
 }
 
+void StringValueResult::HandleUnicodeError(bool force_error) {
+	bool first_nl;
+	auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
+	// sanitize borked line
+	std::vector<char> charArray(borked_line.begin(), borked_line.end());
+	charArray.push_back('\0'); // Null-terminate the character array
+	Utf8Proc::MakeValid(&charArray[0], charArray.size());
+	borked_line = {charArray.begin(), charArray.end() - 1};
+	LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
+	auto csv_error = CSVError::InvalidUTF8(state_machine.options, cur_col_id - 1, lines_per_batch, borked_line,
+	                                       current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
+	error_handler.Error(csv_error, force_error);
+}
+
 void StringValueResult::QuotedNewLine(StringValueResult &result) {
 	result.quoted_new_line = true;
 }
@@ -403,14 +417,21 @@ bool StringValueResult::AddRowInternal() {
 	current_line_position.begin = current_line_position.end;
 	current_line_position.end = current_line_start;
-	if (ignore_current_row) {
+	if (current_error.is_set) {
+		switch (current_error.type) {
+		case CSVErrorType::TOO_MANY_COLUMNS:
 			HandleOverLimitRows();
+			break;
+		case CSVErrorType::INVALID_UNICODE:
+			HandleUnicodeError();
+			break;
+		default:
+			InvalidInputException("CSV Error not allowed when inserting row");
+		}
 		cur_col_id = 0;
 		chunk_col_id = 0;
 		// An error occurred on this row, we are ignoring it and resetting our control flag
-		ignore_current_row = false;
+		current_error.Reset();
 		return false;
 	}
 	if (!cast_errors.empty()) {
@@ -766,14 +787,14 @@ void StringValueScanner::ProcessExtraRow() {
 			return;
 		case CSVState::RECORD_SEPARATOR:
 			if (states.states[0] == CSVState::RECORD_SEPARATOR) {
-				lines_read++;
 				result.EmptyLine(result, iterator.pos.buffer_pos);
 				iterator.pos.buffer_pos++;
+				lines_read++;
 				return;
 			} else if (states.states[0] != CSVState::CARRIAGE_RETURN) {
-				lines_read++;
 				result.AddRow(result, iterator.pos.buffer_pos);
 				iterator.pos.buffer_pos++;
+				lines_read++;
 				return;
 			}
 			lines_read++;
diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
index e412e21f8c4c..6380d2e24c6c 100644
--- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
+++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
@@ -144,6 +144,7 @@ bool IsCSVErrorAcceptedReject(CSVErrorType type) {
 	case CSVErrorType::CAST_ERROR:
 	case CSVErrorType::TOO_MANY_COLUMNS:
 	case CSVErrorType::TOO_FEW_COLUMNS:
 	case CSVErrorType::MAXIMUM_LINE_SIZE:
 	case CSVErrorType::UNTERMINATED_QUOTES:
+	case CSVErrorType::INVALID_UNICODE:
 		return true;
 	default:
 		return false;
 	}
 }
@@ -162,6 +163,8 @@ string CSVErrorTypeToEnum(CSVErrorType type) {
 		return "LINE SIZE OVER MAXIMUM";
 	case CSVErrorType::UNTERMINATED_QUOTES:
 		return "UNQUOTED VALUE";
+	case CSVErrorType::INVALID_UNICODE:
+		return "INVALID UNICODE";
 	default:
 		throw InternalException("CSV Error is not valid to be stored in a Rejects Table");
 	}
 }
@@ -195,7 +198,7 @@
- ignore_current_row = true; + current_error = {CSVErrorType::INVALID_UNICODE}; break; } if (allocate) { @@ -282,7 +282,7 @@ void StringValueResult::Reset() { if (cur_buffer) { buffer_handles[cur_buffer->buffer_idx] = cur_buffer; } - ignore_current_row = false; + current_error.Reset(); } void StringValueResult::AddQuotedValue(StringValueResult &result, const idx_t buffer_pos) { @@ -337,6 +337,20 @@ void StringValueResult::HandleOverLimitRows() { error_handler.Error(csv_error); } +void StringValueResult::HandleUnicodeError(bool force_error) { + bool first_nl; + auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); + // sanitize borked line + std::vector charArray(borked_line.begin(), borked_line.end()); + charArray.push_back('\0'); // Null-terminate the character array + Utf8Proc::MakeValid(&charArray[0], charArray.size()); + borked_line = {charArray.begin(), charArray.end() - 1}; + LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); + auto csv_error = CSVError::InvalidUTF8(state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); + error_handler.Error(csv_error, force_error); +} + void StringValueResult::QuotedNewLine(StringValueResult &result) { result.quoted_new_line = true; } @@ -403,14 +417,21 @@ bool StringValueResult::AddRowInternal() { } current_line_position.begin = current_line_position.end; current_line_position.end = current_line_start; - if (ignore_current_row) { - if (cur_col_id >= number_of_columns) { + if (current_error.is_set) { + switch (current_error.type) { + case CSVErrorType::TOO_MANY_COLUMNS: HandleOverLimitRows(); + break; + case CSVErrorType::INVALID_UNICODE: + HandleUnicodeError(); + break; + default: + InvalidInputException("CSV Error not allowed when inserting row"); } cur_col_id = 0; chunk_col_id = 0; // An error occurred on this row, we are ignoring it and resetting our control flag - ignore_current_row = false; + current_error.Reset(); return false; } if (!cast_errors.empty()) { @@ -766,14 +787,14 @@ void StringValueScanner::ProcessExtraRow() { return; case CSVState::RECORD_SEPARATOR: if (states.states[0] == CSVState::RECORD_SEPARATOR) { - lines_read++; result.EmptyLine(result, iterator.pos.buffer_pos); iterator.pos.buffer_pos++; + lines_read++; return; } else if (states.states[0] != CSVState::CARRIAGE_RETURN) { - lines_read++; result.AddRow(result, iterator.pos.buffer_pos); iterator.pos.buffer_pos++; + lines_read++; return; } lines_read++; diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index e412e21f8c4c..6380d2e24c6c 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -144,6 +144,7 @@ bool IsCSVErrorAcceptedReject(CSVErrorType type) { case CSVErrorType::TOO_FEW_COLUMNS: case CSVErrorType::MAXIMUM_LINE_SIZE: case CSVErrorType::UNTERMINATED_QUOTES: + case CSVErrorType::INVALID_UNICODE: return true; default: return false; @@ -162,6 +163,8 @@ string CSVErrorTypeToEnum(CSVErrorType type) { return "LINE SIZE OVER MAXIMUM"; case CSVErrorType::UNTERMINATED_QUOTES: return "UNQUOTED VALUE"; + case CSVErrorType::INVALID_UNICODE: + return "INVALID UNICODE"; default: throw InternalException("CSV Error is not valid to be stored in a Rejects Table"); } @@ -195,7 +198,7 @@ void CSVGlobalState::FillRejectsTable() { 
auto row_line = file->error_handler->GetLine(error.error_info); auto col_idx = error.column_idx; string col_name; - if (error.type != CSVErrorType::TOO_MANY_COLUMNS){ + if (error.type != CSVErrorType::TOO_MANY_COLUMNS) { // Too many columns does not have a name, all other errors have col_name = bind_data.return_names[col_idx]; } @@ -210,7 +213,7 @@ void CSVGlobalState::FillRejectsTable() { // 4. Column Index appender.Append(col_idx); // 5. Column Name (If Applicable) - if (col_name.empty()){ + if (col_name.empty()) { appender.Append(Value()); } else { appender.Append(string_t("\"" + col_name + "\"")); diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index 217bf0ac9fec..9c1eaffd394f 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -169,13 +169,14 @@ CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, i } } -CSVError CSVError::InvalidUTF8(const CSVReaderOptions &options, LinesPerBoundary error_info) { +CSVError CSVError::InvalidUTF8(const CSVReaderOptions &options, idx_t current_column, LinesPerBoundary error_info, + string &csv_row, idx_t byte_position) { std::ostringstream error; // How many columns were expected and how many were found error << "Invalid unicode (byte sequence mismatch) detected." << std::endl; // What were the options error << options.ToString(); - return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, error_info); + return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, current_column, csv_row, error_info, byte_position); } bool CSVErrorHandler::PrintLineNumber(CSVError &error) { diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 3f2acf553f21..4140b5d3eac5 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -31,7 +31,8 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData order_errors.SetValue(2, "TOO MANY COLUMNS"); order_errors.SetValue(3, "UNQUOTED VALUE"); order_errors.SetValue(4, "LINE SIZE OVER MAXIMUM"); - LogicalType enum_type = LogicalType::ENUM(enum_name, order_errors, 5); + order_errors.SetValue(5, "INVALID UNICODE"); + LogicalType enum_type = LogicalType::ENUM(enum_name, order_errors, 6); auto type_info = make_uniq(enum_name, enum_type); type_info->temporary = true; type_info->on_conflict = OnCreateConflict::IGNORE_ON_CONFLICT; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp index 82160a8490d8..3ced0619aa67 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp @@ -72,7 +72,8 @@ class CSVError { //! 
Produces error for incorrect (e.g., smaller and lower than the predefined) number of columns in a CSV Line static CSVError IncorrectColumnAmountError(const CSVReaderOptions &state_machine, idx_t actual_columns, LinesPerBoundary error_info, string &csv_row, idx_t byte_position); - static CSVError InvalidUTF8(const CSVReaderOptions &options, LinesPerBoundary error_info); + static CSVError InvalidUTF8(const CSVReaderOptions &options, idx_t current_column, LinesPerBoundary error_info, + string &csv_row, idx_t byte_position); idx_t GetBoundaryIndex() { return error_info.boundary_idx; diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index d88fc76e7390..18d1776b7a71 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -62,6 +62,17 @@ class FullLinePosition { unordered_map> &buffer_handles); }; +class CurrentError { +public: + CurrentError() : is_set(false) {}; + CurrentError(CSVErrorType type) : is_set(true), type(type) {}; + void Reset() { + is_set = false; + } + bool is_set; + CSVErrorType type; +}; + class StringValueResult : public ScannerResult { public: StringValueResult(CSVStates &states, CSVStateMachine &state_machine, @@ -120,8 +131,8 @@ class StringValueResult : public ScannerResult { //! Requested size of buffers (i.e., either 32Mb or set by buffer_size parameter) idx_t requested_size; - //! If the current row has an error, we have to skip it - bool ignore_current_row = false; + //! Current Error if any + CurrentError current_error; bool sniffing; //! Specialized code for quoted values, makes sure to remove quotes and escapes @@ -140,6 +151,7 @@ class StringValueResult : public ScannerResult { inline bool AddRowInternal(); void HandleOverLimitRows(); + void HandleUnicodeError(bool force_error = false); inline void AddValueToVector(const char *value_ptr, const idx_t size, bool allocate = false); diff --git a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test new file mode 100644 index 000000000000..bcf1b6f64ecb --- /dev/null +++ b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test @@ -0,0 +1,15 @@ +# name: test/sql/copy/csv/rejects/test_invalid_utf_rejects.test +# description: Test that invalid unicodes return correct info as rejects tables +# group: [rejects] + +require skip_reload + +statement ok +from read_csv('test/sql/copy/csv/data/test/invalid_utf_big.csv',columns = {'col1': 'VARCHAR','col2': 'VARCHAR','col3': 'VARCHAR'}, + auto_detect=false, rejects_table='csv_rejects_table', header = 0, delim = ',', ignore_errors=true) + +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position +FROM csv_rejects_table; +---- + diff --git a/third_party/utf8proc/include/utf8proc_wrapper.hpp b/third_party/utf8proc/include/utf8proc_wrapper.hpp index fb988b254b76..a7fb8c662b24 100644 --- a/third_party/utf8proc/include/utf8proc_wrapper.hpp +++ b/third_party/utf8proc/include/utf8proc_wrapper.hpp @@ -26,6 +26,8 @@ class Utf8Proc { static char* Normalize(const char* s, size_t len); //! Returns whether or not the UTF8 string is valid static bool IsValid(const char *s, size_t len); + //! 
Makes Invalid Unicode valid by replacing invalid parts with a given character + static void MakeValid(char *s, size_t len, char special_flag = '?'); //! Returns the position (in bytes) of the next grapheme cluster static size_t NextGraphemeCluster(const char *s, size_t len, size_t pos); //! Returns the position (in bytes) of the previous grapheme cluster diff --git a/third_party/utf8proc/utf8proc_wrapper.cpp b/third_party/utf8proc/utf8proc_wrapper.cpp index c47472a39eb6..02f6c0efc5de 100644 --- a/third_party/utf8proc/utf8proc_wrapper.cpp +++ b/third_party/utf8proc/utf8proc_wrapper.cpp @@ -102,6 +102,39 @@ UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *i return type; } +void Utf8Proc::MakeValid(char *s, size_t len, char special_flag){ + UnicodeType type = UnicodeType::ASCII; + for (size_t i = 0; i < len; i++) { + int c = (int) s[i]; + if ((c & 0x80) == 0) { + continue; + } + int first_pos_seq = i; + if ((c & 0xE0) == 0xC0) { + /* 2 byte sequence */ + int utf8char = c & 0x1F; + type = UTF8ExtraByteLoop<1, 0x000780>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr); + } else if ((c & 0xF0) == 0xE0) { + /* 3 byte sequence */ + int utf8char = c & 0x0F; + type = UTF8ExtraByteLoop<2, 0x00F800>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr); + } else if ((c & 0xF8) == 0xF0) { + /* 4 byte sequence */ + int utf8char = c & 0x07; + type = UTF8ExtraByteLoop<3, 0x1F0000>(first_pos_seq, utf8char, i, s, len, nullptr, nullptr); + } else { + /* invalid UTF-8 start byte */ + s[i] = special_flag; // Rewrite invalid byte + } + if (type == UnicodeType::INVALID) { + for (size_t j = first_pos_seq; j <= i; j++) { + s[j] = special_flag; // Rewrite each byte of the invalid sequence + } + type = UnicodeType::ASCII; + } + } +} + char* Utf8Proc::Normalize(const char *s, size_t len) { assert(s); assert(Utf8Proc::Analyze(s, len) != UnicodeType::INVALID); From 4ad73e518e0f945bf2760712d6cc91a009024e58 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 6 Mar 2024 18:41:45 +0100 Subject: [PATCH 028/147] invalid utf working --- src/execution/operator/persistent/csv_rejects_table.cpp | 2 +- test/sql/copy/csv/rejects/test_invalid_utf_rejects.test | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 4140b5d3eac5..a2f80d855d15 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -25,7 +25,7 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData // Create CSV_ERROR_TYPE ENUM string enum_name = "CSV_ERROR_TYPE"; - Vector order_errors(LogicalType::VARCHAR, 5); + Vector order_errors(LogicalType::VARCHAR, 6); order_errors.SetValue(0, "CAST"); order_errors.SetValue(1, "MISSING COLUMNS"); order_errors.SetValue(2, "TOO MANY COLUMNS"); diff --git a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test index bcf1b6f64ecb..b2082773ae5e 100644 --- a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test +++ b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test @@ -12,4 +12,7 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- - +test/sql/copy/csv/data/test/invalid_utf_big.csv 3001 2 "col3" INVALID UNICODE valid,invalid_??_part,valid 54000 
+test/sql/copy/csv/data/test/invalid_utf_big.csv	3012	2	"col3"	INVALID UNICODE	valid,valid,invalid_??_part	54208
+test/sql/copy/csv/data/test/invalid_utf_big.csv	3023	2	"col3"	INVALID UNICODE	valid,invalid_??_part,valid	54416
+test/sql/copy/csv/data/test/invalid_utf_big.csv	3034	2	"col3"	INVALID UNICODE	valid,valid,invalid_??_part	54624

From cd1c2d70b28a4f94f9d77cbc107f6b510fcb1aee Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Thu, 7 Mar 2024 11:51:24 +0100
Subject: [PATCH 029/147] make tidy happy and skip tests on windows due to
 byte_position mismatch

---
 .../csv_scanner/scanner/string_value_scanner.cpp       | 10 +++++-----
 test/sql/copy/csv/rejects/csv_buffer_size_rejects.test |  3 +++
 .../rejects/csv_incorrect_columns_amount_rejects.test  |  3 +++
 test/sql/copy/csv/rejects/csv_rejects_auto.test        |  3 +++
 test/sql/copy/csv/rejects/csv_rejects_flush_cast.test  |  5 +++++
 test/sql/copy/csv/rejects/csv_rejects_read.test        |  3 +++
 test/sql/copy/csv/rejects/test_invalid_parameters.test |  3 +++
 .../sql/copy/csv/rejects/test_invalid_utf_rejects.test |  3 +++
 8 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index a87f43917e1b..6e9ecdd1d1c3 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -341,10 +341,10 @@ void StringValueResult::HandleUnicodeError(bool force_error) {
 	bool first_nl;
 	auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
 	// sanitize borked line
-	std::vector<char> charArray(borked_line.begin(), borked_line.end());
-	charArray.push_back('\0'); // Null-terminate the character array
-	Utf8Proc::MakeValid(&charArray[0], charArray.size());
-	borked_line = {charArray.begin(), charArray.end() - 1};
+	std::vector<char> char_array(borked_line.begin(), borked_line.end());
+	char_array.push_back('\0'); // Null-terminate the character array
+	Utf8Proc::MakeValid(&char_array[0], char_array.size());
+	borked_line = {char_array.begin(), char_array.end() - 1};
 	LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
 	auto csv_error = CSVError::InvalidUTF8(state_machine.options, cur_col_id - 1, lines_per_batch, borked_line,
 	                                       current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
@@ -426,7 +426,7 @@ bool StringValueResult::AddRowInternal() {
 			HandleUnicodeError();
 			break;
 		default:
-			InvalidInputException("CSV Error not allowed when inserting row");
+			throw InvalidInputException("CSV Error not allowed when inserting row");
 		}
 		cur_col_id = 0;
 		chunk_col_id = 0;
diff --git a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
index dcef91e814ee..a8fd11728781 100644
--- a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
+++ b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test
@@ -4,6 +4,9 @@
 
 require skip_reload
 
+# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n
+require notwindows
+
 loop buffer_size 5 10
 
 # Ensure that we can get the schema if we reduce the sample size and ignore errors
diff --git a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
index 414e9fdd8594..070b413a8497 100644
--- a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
+++ 
b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test @@ -4,6 +4,9 @@ require skip_reload +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + statement ok SELECT * FROM read_csv( 'data/csv/rejects/incorrect_columns/few_columns.csv', diff --git a/test/sql/copy/csv/rejects/csv_rejects_auto.test b/test/sql/copy/csv/rejects/csv_rejects_auto.test index 841ed42465f3..febda7d1c0fc 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_auto.test +++ b/test/sql/copy/csv/rejects/csv_rejects_auto.test @@ -3,6 +3,9 @@ require skip_reload +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + # Ensure that we can get the schema if we reduce the sample size and ignore errors query IIIII SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( diff --git a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test index 6b2f5e59d7f5..20ff320b36a5 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test +++ b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test @@ -2,6 +2,11 @@ # description: Test that Flush Cast functions properly for the rejects tables # group: [rejects] +require skip_reload + +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + query III SELECT typeof(first(a)), typeof(first(b)), COUNT(*) FROM read_csv( 'data/csv/error/flush_cast.csv', diff --git a/test/sql/copy/csv/rejects/csv_rejects_read.test b/test/sql/copy/csv/rejects/csv_rejects_read.test index a0b2e751289d..8cf3d5ac883f 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_read.test +++ b/test/sql/copy/csv/rejects/csv_rejects_read.test @@ -3,6 +3,9 @@ require skip_reload +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + query III rowsort SELECT * FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/bad.csv', diff --git a/test/sql/copy/csv/rejects/test_invalid_parameters.test b/test/sql/copy/csv/rejects/test_invalid_parameters.test index 2e343a30765d..9325f3780f24 100644 --- a/test/sql/copy/csv/rejects/test_invalid_parameters.test +++ b/test/sql/copy/csv/rejects/test_invalid_parameters.test @@ -3,6 +3,9 @@ require skip_reload +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + # Test invalid arguments statement error SELECT * FROM read_csv( diff --git a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test index b2082773ae5e..52ff0ac19823 100644 --- a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test +++ b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test @@ -4,6 +4,9 @@ require skip_reload +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + statement ok from read_csv('test/sql/copy/csv/data/test/invalid_utf_big.csv',columns = {'col1': 'VARCHAR','col2': 'VARCHAR','col3': 'VARCHAR'}, auto_detect=false, rejects_table='csv_rejects_table', header = 0, delim = ',', ignore_errors=true) From 2bd5af56ffb1dc9d3dd7ef64259f0bb23969d795 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 7 Mar 2024 12:12:00 +0100 Subject: [PATCH 030/147] Adding tests and fixes for rejects over maximum line size --- 
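Note: the byte_position values asserted by the tests in this patch count raw bytes
from the start of the file. A minimal sketch of that arithmetic, assuming uniform
line endings (the helper name below is illustrative, not a DuckDB API):

#include <cstddef>
#include <string>
#include <vector>

// Byte offset at which the (0-based) line_idx-th line starts: every preceding
// line contributes its own bytes plus the newline width, 1 for "\n", 2 for "\r\n".
static std::size_t LineStartOffset(const std::vector<std::string> &lines, std::size_t line_idx,
                                   std::size_t newline_width) {
	std::size_t offset = 0;
	for (std::size_t i = 0; i < line_idx; i++) {
		offset += lines[i].size() + newline_width;
	}
	return offset;
}

For max_10.csv below, the rejected line 5 starts after "a,b" (3 bytes), three "bla,N"
rows (5 bytes each), and four newline bytes: 3 + 15 + 4 = 22, exactly the byte_position
the test asserts. Under "\r\n" the same line would start at 26, which is why these
tests carry "require notwindows".
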
data/csv/rejects/maximum_line/max_10.csv | 8 + data/csv/rejects/maximum_line/over_vector.csv | 2926 +++++++++++++++++ .../scanner/string_value_scanner.cpp | 6 +- .../csv/rejects/csv_rejects_maximum_line.test | 86 + 4 files changed, 3023 insertions(+), 3 deletions(-) create mode 100644 data/csv/rejects/maximum_line/max_10.csv create mode 100644 data/csv/rejects/maximum_line/over_vector.csv create mode 100644 test/sql/copy/csv/rejects/csv_rejects_maximum_line.test diff --git a/data/csv/rejects/maximum_line/max_10.csv b/data/csv/rejects/maximum_line/max_10.csv new file mode 100644 index 000000000000..02a3683c09e0 --- /dev/null +++ b/data/csv/rejects/maximum_line/max_10.csv @@ -0,0 +1,8 @@ +a,b +bla,1 +bla,2 +bla,3 +blaaaaaaaaaaaaaa,4 +bla,1 +bla,2 +bla,3 \ No newline at end of file diff --git a/data/csv/rejects/maximum_line/over_vector.csv b/data/csv/rejects/maximum_line/over_vector.csv new file mode 100644 index 000000000000..c8fe70a412a4 --- /dev/null +++ b/data/csv/rejects/maximum_line/over_vector.csv @@ -0,0 +1,2926 @@ +a,b +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 
+bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 
+bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 
+bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 
+bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +blaaaaaaaaaaaaaaaaaaaa,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 
+bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +blaaaaaaaaaaaaaaaaaaaa,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 
+bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +bla,3 +bla,1 +bla,2 +blaaaaaaaaaaaaaaaaaaaa,3 +bla,1 +bla,2 +bla,3 diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 6e9ecdd1d1c3..c9207c2298ee 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -406,17 +406,17 @@ bool StringValueResult::AddRowInternal() { if (store_line_size) { error_handler.NewMaxLineSize(current_line_size); } + current_line_position.begin = current_line_position.end; + current_line_position.end = current_line_start; if (current_line_size > state_machine.options.maximum_line_size) { bool first_nl; auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); - LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), number_of_rows); + LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); auto csv_error = CSVError::LineSizeError(state_machine.options, current_line_size, lines_per_batch, borked_line, current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); } - current_line_position.begin = current_line_position.end; - current_line_position.end = current_line_start; if (current_error.is_set) { switch (current_error.type) { case CSVErrorType::TOO_MANY_COLUMNS: diff --git a/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test new file mode 100644 index 000000000000..f7bb5447485c --- /dev/null +++ b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test @@ -0,0 +1,86 @@ +# name: test/sql/copy/csv/rejects/csv_rejects_maximum_line.test +# description: Tests rejects tables on max line size parameter +# group: [rejects] + +require skip_reload + +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + +statement ok +SELECT * FROM read_csv( + 'data/csv/rejects/maximum_line/max_10.csv', + columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, + rejects_table='csv_rejects_table', + ignore_errors=true, auto_detect=false, header = 1, max_line_size=10); + +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position +FROM csv_rejects_table; +---- +data/csv/rejects/maximum_line/max_10.csv 5 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 + +statement ok +DROP TABLE csv_rejects_table; + +# Test with buffer sizes + +loop buffer_size 22 27 + +statement ok +SELECT * FROM read_csv( + 'data/csv/rejects/maximum_line/max_10.csv', + columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, + rejects_table='csv_rejects_table', + ignore_errors=true, auto_detect=false, header = 1, max_line_size=10, buffer_size=${buffer_size}); + +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position +FROM csv_rejects_table; +---- +data/csv/rejects/maximum_line/max_10.csv 5 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 + +statement ok +DROP TABLE csv_rejects_table; + +endloop + +# Test over vector size file +statement ok +SELECT * FROM read_csv( + 'data/csv/rejects/maximum_line/over_vector.csv', + columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, + 
rejects_table='csv_rejects_table', + ignore_errors=true, auto_detect=false, header = 1, max_line_size=20); + +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position +FROM csv_rejects_table; +---- +data/csv/rejects/maximum_line/over_vector.csv 2282 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 13684 +data/csv/rejects/maximum_line/over_vector.csv 2591 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 15557 +data/csv/rejects/maximum_line/over_vector.csv 2923 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 17568 + +statement ok +DROP TABLE csv_rejects_table; + +# Read Multiple Files + +statement ok +SELECT * FROM read_csv( + 'data/csv/rejects/maximum_line/*.csv', + columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, + rejects_table='csv_rejects_table', + ignore_errors=true, auto_detect=false, header = 1, max_line_size=10); + +query IIIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position +FROM csv_rejects_table; +---- +data/csv/rejects/maximum_line/max_10.csv 5 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 +data/csv/rejects/maximum_line/over_vector.csv 2282 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 13684 +data/csv/rejects/maximum_line/over_vector.csv 2591 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 15557 +data/csv/rejects/maximum_line/over_vector.csv 2923 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 17568 + +statement ok +DROP TABLE csv_rejects_table; \ No newline at end of file From 73e8c36c6b537aa3df0bed2b2980fd2b2e6ef75b Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 7 Mar 2024 15:29:07 +0100 Subject: [PATCH 031/147] Properly handling unquoted values w ignore_errors --- data/csv/rejects/unquoted/basic.csv | 8 ++ .../scanner/string_value_scanner.cpp | 73 ++++++++++++------- .../state_machine/csv_state_machine_cache.cpp | 20 ++--- .../operator/csv_scanner/csv_error.hpp | 18 ++--- .../csv_scanner/string_value_scanner.hpp | 5 ++ .../csv/rejects/csv_unquoted_rejects.test | 31 ++++++++ 6 files changed, 110 insertions(+), 45 deletions(-) create mode 100644 data/csv/rejects/unquoted/basic.csv create mode 100644 test/sql/copy/csv/rejects/csv_unquoted_rejects.test diff --git a/data/csv/rejects/unquoted/basic.csv b/data/csv/rejects/unquoted/basic.csv new file mode 100644 index 000000000000..8f7dc567086a --- /dev/null +++ b/data/csv/rejects/unquoted/basic.csv @@ -0,0 +1,8 @@ +a,b +"bla",1 +"bla",2 +"bla",3 +"blaaaaaaaaaaaaaa"bla,4 +"bla",1 +"bla",2 +"bla",3 \ No newline at end of file diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index c9207c2298ee..180563271a74 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -351,6 +351,40 @@ void StringValueResult::HandleUnicodeError(bool force_error) { error_handler.Error(csv_error, force_error); } +void StringValueResult::HandleUnterminatedQuotes(bool force_error) { + LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); + bool first_nl; + auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); + auto csv_error = + CSVError::UnterminatedQuotesError(state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); + 
error_handler.Error(csv_error, force_error); +} + +bool StringValueResult::HandleError() { + if (current_error.is_set) { + switch (current_error.type) { + case CSVErrorType::TOO_MANY_COLUMNS: + HandleOverLimitRows(); + break; + case CSVErrorType::INVALID_UNICODE: + HandleUnicodeError(); + break; + case CSVErrorType::UNTERMINATED_QUOTES: + HandleUnterminatedQuotes(); + break; + default: + throw InvalidInputException("CSV Error not allowed when inserting row"); + } + cur_col_id = 0; + chunk_col_id = 0; + // An error occurred on this row, we are ignoring it and resetting our control flag + current_error.Reset(); + return true; + } + return false; +} + void StringValueResult::QuotedNewLine(StringValueResult &result) { result.quoted_new_line = true; } @@ -360,7 +394,7 @@ void StringValueResult::NullPaddingQuotedNewlineCheck() { if (state_machine.options.null_padding && iterator.IsBoundarySet() && quoted_new_line && iterator.done) { // If we have null_padding set, we found a quoted new line, we are scanning the file in parallel and it's the // last row of this thread. - LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), number_of_rows + 1); + LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); auto csv_error = CSVError::NullPaddingFail(state_machine.options, lines_per_batch); error_handler.Error(csv_error); } @@ -417,21 +451,7 @@ bool StringValueResult::AddRowInternal() { current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); } - if (current_error.is_set) { - switch (current_error.type) { - case CSVErrorType::TOO_MANY_COLUMNS: - HandleOverLimitRows(); - break; - case CSVErrorType::INVALID_UNICODE: - HandleUnicodeError(); - break; - default: - throw InvalidInputException("CSV Error not allowed when inserting row"); - } - cur_col_id = 0; - chunk_col_id = 0; - // An error occurred on this row, we are ignoring it and resetting our control flag - current_error.Reset(); + if (HandleError()) { return false; } if (!cast_errors.empty()) { @@ -533,15 +553,12 @@ bool StringValueResult::AddRow(StringValueResult &result, const idx_t buffer_pos } void StringValueResult::InvalidState(StringValueResult &result) { - // FIXME: How do we recover from an invalid state? Can we restart the state machine and jump to the next row? - LinesPerBoundary lines_per_batch(result.iterator.GetBoundaryIdx(), result.number_of_rows); - bool first_nl; - auto borked_line = result.current_line_position.ReconstructCurrentLine(first_nl, result.buffer_handles); - - auto csv_error = CSVError::UnterminatedQuotesError( - result.state_machine.options, result.cur_col_id, lines_per_batch, borked_line, - result.current_line_position.begin.GetGlobalPosition(result.requested_size, first_nl)); - result.error_handler.Error(csv_error); + bool force_error = !result.state_machine.options.ignore_errors && result.sniffing; + // Invalid unicode, we must error + if (force_error) { + result.HandleUnicodeError(force_error); + } + result.current_error = {CSVErrorType::UNTERMINATED_QUOTES}; } bool StringValueResult::EmptyLine(StringValueResult &result, const idx_t buffer_pos) { @@ -1205,7 +1222,9 @@ void StringValueScanner::FinalizeChunkProcess() { // If we are not done we have two options. // 1) If a boundary is set. if (iterator.IsBoundarySet()) { - iterator.done = true; + if (!(result.current_error == CSVErrorType::UNTERMINATED_QUOTES)) { + iterator.done = true; + } // We read until the next line or until we have nothing else to read. 
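 		// Assumed intent of the change above: an unterminated quote at a scanner
 		// boundary may just be a quoted value that continues into the next thread's
 		// range, so this boundary is not marked done and scanning continues; if
 		// nothing is left to read, HandleError() below records the reject instead.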
// Move to next buffer if (!cur_buffer_handle) { @@ -1221,6 +1240,8 @@ void StringValueScanner::FinalizeChunkProcess() { if (cur_buffer_handle->is_last_buffer && iterator.pos.buffer_pos >= cur_buffer_handle->actual_size) { MoveToNextBuffer(); } + } else { + result.HandleError(); } } else { // 2) If a boundary is not set diff --git a/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp b/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp index 1f9768826edd..fbe0752311df 100644 --- a/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +++ b/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp @@ -21,7 +21,6 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op InitializeTransitionArray(transition_array, cur_state, CSVState::QUOTED); break; case CSVState::UNQUOTED: - case CSVState::INVALID: case CSVState::ESCAPE: InitializeTransitionArray(transition_array, cur_state, CSVState::INVALID); break; @@ -38,15 +37,16 @@ void CSVStateMachineCache::Insert(const CSVStateMachineOptions &state_machine_op auto new_line_id = state_machine_options.new_line.GetValue(); // Now set values depending on configuration - // 1) Standard State - transition_array[delimiter][static_cast(static_cast(CSVState::STANDARD))] = CSVState::DELIMITER; - transition_array[static_cast('\n')][static_cast(CSVState::STANDARD)] = CSVState::RECORD_SEPARATOR; - if (new_line_id == NewLineIdentifier::CARRY_ON) { - transition_array[static_cast('\r')][static_cast(CSVState::STANDARD)] = - CSVState::CARRIAGE_RETURN; - } else { - transition_array[static_cast('\r')][static_cast(CSVState::STANDARD)] = - CSVState::RECORD_SEPARATOR; + // 1) Standard/Invalid State + vector std_inv {static_cast(CSVState::STANDARD), static_cast(CSVState::INVALID)}; + for (auto &state : std_inv) { + transition_array[delimiter][state] = CSVState::DELIMITER; + transition_array[static_cast('\n')][state] = CSVState::RECORD_SEPARATOR; + if (new_line_id == NewLineIdentifier::CARRY_ON) { + transition_array[static_cast('\r')][state] = CSVState::CARRIAGE_RETURN; + } else { + transition_array[static_cast('\r')][state] = CSVState::RECORD_SEPARATOR; + } } // 2) Field Separator State transition_array[delimiter][static_cast(CSVState::DELIMITER)] = CSVState::DELIMITER; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp index 3ced0619aa67..befc3a669219 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp @@ -37,15 +37,15 @@ class LinesPerBoundary { }; enum CSVErrorType : uint8_t { - CAST_ERROR = 0, // If when casting a value from string to the column type fails - COLUMN_NAME_TYPE_MISMATCH = 1, // If there is a mismatch between Column Names and Types - TOO_FEW_COLUMNS = 2, // If the CSV has too few columns - TOO_MANY_COLUMNS = 3, // If the CSV has too many column - UNTERMINATED_QUOTES = 4, // If a quote is not terminated - SNIFFING = 5, // If something went wrong during sniffing and was not possible to find suitable candidates - MAXIMUM_LINE_SIZE = 6, // Maximum line size was exceeded by a line in the CSV File - NULLPADDED_QUOTED_NEW_VALUE = 7, // If the null_padding option is set, and we have quoted new values in parallel - INVALID_UNICODE = 8 + CAST_ERROR = 0, //! If when casting a value from string to the column type fails + COLUMN_NAME_TYPE_MISMATCH = 1, //! 
If there is a mismatch between Column Names and Types
+	TOO_FEW_COLUMNS = 2,     //! If the CSV has too few columns
+	TOO_MANY_COLUMNS = 3,    //! If the CSV has too many columns
+	UNTERMINATED_QUOTES = 4, //! If a quote is not terminated
+	SNIFFING = 5,     //! If something went wrong during sniffing and was not possible to find suitable candidates
+	MAXIMUM_LINE_SIZE = 6,   //! Maximum line size was exceeded by a line in the CSV File
+	NULLPADDED_QUOTED_NEW_VALUE = 7, //! If the null_padding option is set, and we have quoted new values in parallel
+	INVALID_UNICODE = 8              //! If we have invalid unicode values
 };
 
 class CSVError {
diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
index 18d1776b7a71..3869f0b94d5d 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
@@ -71,6 +71,9 @@ class CurrentError {
 	}
 	bool is_set;
 	CSVErrorType type;
+	friend bool operator==(const CurrentError &error, CSVErrorType other) {
+		return error.is_set && error.type == other;
+	}
 };
 
 class StringValueResult : public ScannerResult {
@@ -152,6 +155,8 @@ class StringValueResult : public ScannerResult {
 
 	void HandleOverLimitRows();
 	void HandleUnicodeError(bool force_error = false);
+	void HandleUnterminatedQuotes(bool force_error = false);
+	bool HandleError();
 
 	inline void AddValueToVector(const char *value_ptr, const idx_t size, bool allocate = false);
 
diff --git a/test/sql/copy/csv/rejects/csv_unquoted_rejects.test b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test
new file mode 100644
index 000000000000..a50aef9c60da
--- /dev/null
+++ b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test
@@ -0,0 +1,31 @@
+# name: test/sql/copy/csv/rejects/csv_unquoted_rejects.test
+# description: Tests rejects tables on unquoted values
+# group: [rejects]
+
+require skip_reload
+
+# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n
+require notwindows
+
+query II
+SELECT * FROM read_csv(
+    'data/csv/rejects/unquoted/basic.csv',
+    columns = {'a': 'VARCHAR', 'b': 'INTEGER'},
+    rejects_table='csv_rejects_table',
+    ignore_errors=true, auto_detect=false, header = 1, quote = '"', escape = '"');
+----
+bla	1
+bla	2
+bla	3
+bla	1
+bla	2
+bla	3
+
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
+FROM csv_rejects_table;
+----
+data/csv/rejects/unquoted/basic.csv	5	1	"b"	UNQUOTED VALUE	"blaaaaaaaaaaaaaa"bla,4	28
+
+statement ok
+DROP TABLE csv_rejects_table;

From bc26aa060be9a85c56e3a3ef34b6353214020dac Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Thu, 7 Mar 2024 18:40:08 +0100
Subject: [PATCH 032/147] More work on rejects + unquoted

---
 .../rejects/unquoted/unquoted_last_value.csv  |  5 ++
 .../rejects/unquoted/unquoted_new_line.csv    |  9 +++
 .../scanner/string_value_scanner.cpp          | 28 ++++----
 .../operator/csv_scanner/base_scanner.hpp     |  2 +-
 .../csv_scanner/string_value_scanner.hpp      |  9 +--
 .../csv/rejects/csv_unquoted_rejects.test     | 68 ++++++++++++++++++-
 6 files changed, 101 insertions(+), 20 deletions(-)
 create mode 100644 data/csv/rejects/unquoted/unquoted_last_value.csv
 create mode 100644 data/csv/rejects/unquoted/unquoted_new_line.csv

diff --git a/data/csv/rejects/unquoted/unquoted_last_value.csv b/data/csv/rejects/unquoted/unquoted_last_value.csv
new file mode 100644
index 
000000000000..0d714083e9c8
--- /dev/null
+++ b/data/csv/rejects/unquoted/unquoted_last_value.csv
@@ -0,0 +1,5 @@
+"blaaaaaaaaaaaaaa"
+"bla"
+"bla"
+"bla"
+"bla
diff --git a/data/csv/rejects/unquoted/unquoted_new_line.csv b/data/csv/rejects/unquoted/unquoted_new_line.csv
new file mode 100644
index 000000000000..e42978c5565e
--- /dev/null
+++ b/data/csv/rejects/unquoted/unquoted_new_line.csv
@@ -0,0 +1,9 @@
+a,b
+"bla",1
+"bla",2
+"bla",3
+"blaaaaaaaaaaaaaa
+"bla,4
+"bla",1
+"bla",2
+"bla",3
\ No newline at end of file
diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 180563271a74..b2fc8baa8aa4 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -122,7 +122,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 error = !IsValueNull(null_str_ptr, value_ptr, size);
 }
 if (error) {
- current_error = {CSVErrorType::TOO_MANY_COLUMNS};
+ current_error = {CSVErrorType::TOO_MANY_COLUMNS, cur_col_id};
 }
 return;
 }
@@ -222,10 +222,10 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 bool force_error = !state_machine.options.ignore_errors && sniffing;
 // Invalid unicode, we must error
 if (force_error) {
- HandleUnicodeError(force_error);
+ HandleUnicodeError(cur_col_id, force_error);
 }
 // If we got here, we are ignoring errors, hence we must ignore this line.
- current_error = {CSVErrorType::INVALID_UNICODE};
+ current_error = {CSVErrorType::INVALID_UNICODE, cur_col_id};
 break;
 }
 if (allocate) {
@@ -327,17 +327,17 @@ void StringValueResult::AddValue(StringValueResult &result, const idx_t buffer_p
 result.last_position = buffer_pos + 1;
 }
-void StringValueResult::HandleOverLimitRows() {
+void StringValueResult::HandleOverLimitRows(idx_t col_idx) {
 LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
 bool first_nl;
 auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
 auto csv_error =
- CSVError::IncorrectColumnAmountError(state_machine.options, cur_col_id + 1, lines_per_batch, borked_line,
+ CSVError::IncorrectColumnAmountError(state_machine.options, col_idx, lines_per_batch, borked_line,
 current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
 error_handler.Error(csv_error);
 }
-void StringValueResult::HandleUnicodeError(bool force_error) {
+void StringValueResult::HandleUnicodeError(idx_t col_idx, bool force_error) {
 bool first_nl;
 auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
 // sanitize borked line
 Utf8Proc::MakeValid(&char_array[0], char_array.size());
 borked_line = {char_array.begin(), char_array.end() - 1};
 LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
- auto csv_error = CSVError::InvalidUTF8(state_machine.options, cur_col_id - 1, lines_per_batch, borked_line,
+ auto csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line,
 current_line_position.begin.GetGlobalPosition(requested_size, first_nl));
 error_handler.Error(csv_error, force_error);
 }
-void StringValueResult::HandleUnterminatedQuotes(bool force_error) {
+void StringValueResult::HandleUnterminatedQuotes(idx_t col_idx, bool force_error) {
 LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(),
lines_read); bool first_nl; auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); auto csv_error = - CSVError::UnterminatedQuotesError(state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, + CSVError::UnterminatedQuotesError(state_machine.options, col_idx, lines_per_batch, borked_line, current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error, force_error); } @@ -365,13 +365,13 @@ bool StringValueResult::HandleError() { if (current_error.is_set) { switch (current_error.type) { case CSVErrorType::TOO_MANY_COLUMNS: - HandleOverLimitRows(); + HandleOverLimitRows(current_error.col_idx); break; case CSVErrorType::INVALID_UNICODE: - HandleUnicodeError(); + HandleUnicodeError(current_error.col_idx); break; case CSVErrorType::UNTERMINATED_QUOTES: - HandleUnterminatedQuotes(); + HandleUnterminatedQuotes(current_error.col_idx); break; default: throw InvalidInputException("CSV Error not allowed when inserting row"); @@ -556,9 +556,9 @@ void StringValueResult::InvalidState(StringValueResult &result) { bool force_error = !result.state_machine.options.ignore_errors && result.sniffing; // Invalid unicode, we must error if (force_error) { - result.HandleUnicodeError(force_error); + result.HandleUnicodeError(result.cur_col_id, force_error); } - result.current_error = {CSVErrorType::UNTERMINATED_QUOTES}; + result.current_error = {CSVErrorType::UNTERMINATED_QUOTES, result.cur_col_id}; } bool StringValueResult::EmptyLine(StringValueResult &result, const idx_t buffer_pos) { diff --git a/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp index c8a2f886fa9f..29a62b8e79ae 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/base_scanner.hpp @@ -252,7 +252,7 @@ class BaseScanner { Initialize(); initialized = true; } - if (!iterator.done) { + if (!iterator.done && cur_buffer_handle) { Process(result); } FinalizeChunkProcess(); diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 3869f0b94d5d..e36266d90f69 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -65,12 +65,13 @@ class FullLinePosition { class CurrentError { public: CurrentError() : is_set(false) {}; - CurrentError(CSVErrorType type) : is_set(true), type(type) {}; + CurrentError(CSVErrorType type, idx_t col_idx_p) : is_set(true), type(type), col_idx(col_idx_p) {}; void Reset() { is_set = false; } bool is_set; CSVErrorType type; + idx_t col_idx; friend bool operator==(const CurrentError &error, CSVErrorType other) { return error.is_set && error.type == other; } @@ -153,9 +154,9 @@ class StringValueResult : public ScannerResult { static inline bool EmptyLine(StringValueResult &result, const idx_t buffer_pos); inline bool AddRowInternal(); - void HandleOverLimitRows(); - void HandleUnicodeError(bool force_error = false); - void HandleUnterminatedQuotes(bool force_error = false); + void HandleOverLimitRows(idx_t col_idx); + void HandleUnicodeError(idx_t col_idx, bool force_error = false); + void HandleUnterminatedQuotes(idx_t col_idx, bool force_error = false); bool HandleError(); inline void AddValueToVector(const char *value_ptr, const idx_t size, bool allocate = 
false); diff --git a/test/sql/copy/csv/rejects/csv_unquoted_rejects.test b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test index a50aef9c60da..976b66d0ff05 100644 --- a/test/sql/copy/csv/rejects/csv_unquoted_rejects.test +++ b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test @@ -25,7 +25,73 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -data/csv/rejects/unquoted/basic.csv 5 1 "b" UNQUOTED VALUE "blaaaaaaaaaaaaaa"bla,4 28 +data/csv/rejects/unquoted/basic.csv 5 0 "a" UNQUOTED VALUE "blaaaaaaaaaaaaaa"bla,4 28 statement ok DROP TABLE csv_rejects_table; + +query II +SELECT * FROM read_csv( + 'data/csv/rejects/unquoted/unquoted_new_line.csv', + columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, + rejects_table='csv_rejects_table', + ignore_errors=true, auto_detect=false, header = 1, quote = '"', escape = '"'); +---- +bla 1 +bla 2 +bla 3 +bla 1 +bla 2 +bla 3 + +query IIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, byte_position +FROM csv_rejects_table; +---- +data/csv/rejects/unquoted/unquoted_new_line.csv 5 0 "a" UNQUOTED VALUE 28 + +statement ok +DROP TABLE csv_rejects_table; + +query I +SELECT * FROM read_csv( + 'data/csv/rejects/unquoted/unquoted_last_value.csv', + columns = {'a': 'VARCHAR'}, + rejects_table='csv_rejects_table', + ignore_errors=true, auto_detect=false, header = 0, quote = '"', escape = '"'); +---- +blaaaaaaaaaaaaaa +bla +bla +bla + +query IIIIII rowsort +SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, byte_position +FROM csv_rejects_table; +---- +data/csv/rejects/unquoted/unquoted_last_value.csv 5 0 "a" UNQUOTED VALUE 31 + +statement ok +DROP TABLE csv_rejects_table; + +# Test buffer sizes (borked :( ) +# +#loop buffer_size 35 1001 +# +#statement ok +#SELECT * FROM read_csv( +# 'data/csv/rejects/unquoted/basic.csv', +# columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, +# rejects_table='csv_rejects_table', +# ignore_errors=true, auto_detect=false, header = 1, quote = '"', escape = '"', buffer_size=${buffer_size}); +# +#query IIIIIII rowsort +#SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position +#FROM csv_rejects_table; +#---- +#data/csv/rejects/unquoted/basic.csv 5 0 "a" UNQUOTED VALUE "blaaaaaaaaaaaaaa"bla,4 28 +# +#statement ok +#DROP TABLE csv_rejects_table; +# +#endloop \ No newline at end of file From 0a87b47e63197896bc3361a805d514840de3e1b8 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Fri, 8 Mar 2024 14:27:22 +0100 Subject: [PATCH 033/147] Make column idx 1-indexes --- .../scanner/string_value_scanner.cpp | 4 +- .../table_function/global_csv_state.cpp | 19 ++--- .../operator/csv_scanner/util/csv_error.cpp | 4 +- .../csv/rejects/csv_buffer_size_rejects.test | 8 +- .../copy/csv/rejects/csv_rejects_auto.test | 16 ++-- .../csv/rejects/csv_rejects_flush_cast.test | 4 +- .../csv/rejects/csv_rejects_maximum_line.test | 18 ++--- .../copy/csv/rejects/csv_rejects_read.test | 74 +++++++++---------- .../csv/rejects/csv_unquoted_rejects.test | 6 +- .../csv/rejects/test_invalid_utf_rejects.test | 8 +- 10 files changed, 81 insertions(+), 80 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index b2fc8baa8aa4..0dea719fb455 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp 
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -365,7 +365,7 @@ bool StringValueResult::HandleError() { if (current_error.is_set) { switch (current_error.type) { case CSVErrorType::TOO_MANY_COLUMNS: - HandleOverLimitRows(current_error.col_idx); + HandleOverLimitRows(cur_col_id); break; case CSVErrorType::INVALID_UNICODE: HandleUnicodeError(current_error.col_idx); @@ -510,7 +510,7 @@ bool StringValueResult::AddRowInternal() { auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); auto csv_error = CSVError::IncorrectColumnAmountError( - state_machine.options, cur_col_id, lines_per_batch, borked_line, + state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); // If we are here we ignore_errors, so we delete this line diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 6380d2e24c6c..1d012349da21 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -197,11 +197,6 @@ void CSVGlobalState::FillRejectsTable() { rejects->count++; auto row_line = file->error_handler->GetLine(error.error_info); auto col_idx = error.column_idx; - string col_name; - if (error.type != CSVErrorType::TOO_MANY_COLUMNS) { - // Too many columns does not have a name, all other errors have - col_name = bind_data.return_names[col_idx]; - } // Add the row to the rejects table appender.BeginRow(); // 1. File Path @@ -211,12 +206,18 @@ void CSVGlobalState::FillRejectsTable() { // 3. Byte Position where error occurred appender.Append(error.byte_position); // 4. Column Index - appender.Append(col_idx); + appender.Append(col_idx + 1); // 5. Column Name (If Applicable) - if (col_name.empty()) { + switch (error.type) { + case CSVErrorType::TOO_MANY_COLUMNS: appender.Append(Value()); - } else { - appender.Append(string_t("\"" + col_name + "\"")); + break; + case CSVErrorType::TOO_FEW_COLUMNS: + D_ASSERT(bind_data.return_names.size() > col_idx + 1); + appender.Append(string_t("\"" + bind_data.return_names[col_idx + 1] + "\"")); + break; + default: + appender.Append(string_t("\"" + bind_data.return_names[col_idx] + "\"")); } // 6. 
Error Type appender.Append(string_t(CSVErrorTypeToEnum(error.type))); diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index 9c1eaffd394f..fadfad6716ff 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -157,11 +157,11 @@ CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, i LinesPerBoundary error_info, string &csv_row, idx_t byte_position) { std::ostringstream error; // How many columns were expected and how many were found - error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1 << std::endl; // What were the options error << options.ToString(); - if (actual_columns > options.dialect_options.num_cols) { + if (actual_columns >= options.dialect_options.num_cols) { return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, byte_position); } else { diff --git a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test index a8fd11728781..76b95cfbe731 100644 --- a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test +++ b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test @@ -24,10 +24,10 @@ query IIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -test/sql/copy/csv/data/error/mismatch/big_bad.csv 0 "column0" CAST B, A 10875 -test/sql/copy/csv/data/error/mismatch/big_bad.csv 0 "column0" CAST C, A 20875 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 0 "column0" CAST B, A 18395 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 0 "column0" CAST C, A 28395 +test/sql/copy/csv/data/error/mismatch/big_bad.csv 1 "column0" CAST B, A 10875 +test/sql/copy/csv/data/error/mismatch/big_bad.csv 1 "column0" CAST C, A 20875 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 1 "column0" CAST B, A 18395 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 1 "column0" CAST C, A 28395 query I SELECT error_message diff --git a/test/sql/copy/csv/rejects/csv_rejects_auto.test b/test/sql/copy/csv/rejects/csv_rejects_auto.test index febda7d1c0fc..bfa8073a6567 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_auto.test +++ b/test/sql/copy/csv/rejects/csv_rejects_auto.test @@ -20,32 +20,32 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 0 "column0" CAST B, A 10875 -test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 0 "column0" CAST C, A 20875 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 0 "column0" CAST B, A 18395 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 0 "column0" CAST C, A 28395 +test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 1 "column0" CAST B, A 10875 +test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 1 "column0" CAST C, A 20875 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 1 "column0" CAST B, A 18395 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 1 "column0" CAST C, A 28395 query I SELECT error_message -FROM csv_rejects_table where line=2176 and column_idx=0; +FROM csv_rejects_table where line=2176 and column_idx=1; ---- :.*Could not 
convert string "B" to 'BIGINT'.* query I SELECT error_message -FROM csv_rejects_table where line=4176 and column_idx=0; +FROM csv_rejects_table where line=4176 and column_idx=1; ---- :.*Could not convert string "C" to 'BIGINT'.* query I SELECT error_message -FROM csv_rejects_table where line=3680 and column_idx=0; +FROM csv_rejects_table where line=3680 and column_idx=1; ---- :.*Could not convert string "B" to 'BIGINT'.* query I SELECT error_message -FROM csv_rejects_table where line=5680 and column_idx=0; +FROM csv_rejects_table where line=5680 and column_idx=1; ---- :.*Could not convert string "C" to 'BIGINT'.* diff --git a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test index 20ff320b36a5..69530026555e 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test +++ b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test @@ -23,8 +23,8 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table order by all; ---- -data/csv/error/flush_cast.csv 2813 0 "a" CAST c, bla 44971 -data/csv/error/flush_cast.csv 439 0 "a" CAST B, bla 6996 +data/csv/error/flush_cast.csv 2813 1 "a" CAST c, bla 44971 +data/csv/error/flush_cast.csv 439 1 "a" CAST B, bla 6996 query I SELECT error_message diff --git a/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test index f7bb5447485c..f6214aab0906 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test +++ b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test @@ -18,7 +18,7 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -data/csv/rejects/maximum_line/max_10.csv 5 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 +data/csv/rejects/maximum_line/max_10.csv 5 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 statement ok DROP TABLE csv_rejects_table; @@ -38,7 +38,7 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -data/csv/rejects/maximum_line/max_10.csv 5 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 +data/csv/rejects/maximum_line/max_10.csv 5 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 statement ok DROP TABLE csv_rejects_table; @@ -57,9 +57,9 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -data/csv/rejects/maximum_line/over_vector.csv 2282 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 13684 -data/csv/rejects/maximum_line/over_vector.csv 2591 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 15557 -data/csv/rejects/maximum_line/over_vector.csv 2923 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 17568 +data/csv/rejects/maximum_line/over_vector.csv 2282 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 13684 +data/csv/rejects/maximum_line/over_vector.csv 2591 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 15557 +data/csv/rejects/maximum_line/over_vector.csv 2923 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 17568 statement ok DROP TABLE csv_rejects_table; @@ -77,10 +77,10 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM 
csv_rejects_table; ---- -data/csv/rejects/maximum_line/max_10.csv 5 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 -data/csv/rejects/maximum_line/over_vector.csv 2282 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 13684 -data/csv/rejects/maximum_line/over_vector.csv 2591 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 15557 -data/csv/rejects/maximum_line/over_vector.csv 2923 0 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 17568 +data/csv/rejects/maximum_line/max_10.csv 5 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 +data/csv/rejects/maximum_line/over_vector.csv 2282 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 13684 +data/csv/rejects/maximum_line/over_vector.csv 2591 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 15557 +data/csv/rejects/maximum_line/over_vector.csv 2923 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 17568 statement ok DROP TABLE csv_rejects_table; \ No newline at end of file diff --git a/test/sql/copy/csv/rejects/csv_rejects_read.test b/test/sql/copy/csv/rejects/csv_rejects_read.test index 8cf3d5ac883f..9917965558ba 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_read.test +++ b/test/sql/copy/csv/rejects/csv_rejects_read.test @@ -20,7 +20,7 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -test/sql/copy/csv/data/error/mismatch/bad.csv 2 1 "col1" CAST 4,BBB,9, 9 +test/sql/copy/csv/data/error/mismatch/bad.csv 2 2 "col1" CAST 4,BBB,9, 9 query I SELECT error_message @@ -45,25 +45,25 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -test/sql/copy/csv/data/error/mismatch/bad2.csv 1 2 "col2" CAST 1,2,DDD, 0 -test/sql/copy/csv/data/error/mismatch/bad2.csv 3 0 "col0" CAST EEE,7,FFF, 16 -test/sql/copy/csv/data/error/mismatch/bad2.csv 3 2 "col2" CAST EEE,7,FFF, 16 +test/sql/copy/csv/data/error/mismatch/bad2.csv 1 3 "col2" CAST 1,2,DDD, 0 +test/sql/copy/csv/data/error/mismatch/bad2.csv 3 1 "col0" CAST EEE,7,FFF, 16 +test/sql/copy/csv/data/error/mismatch/bad2.csv 3 3 "col2" CAST EEE,7,FFF, 16 query I SELECT error_message -FROM csv_rejects_table where line=1 and column_idx=2; +FROM csv_rejects_table where line=1 and column_idx=3; ---- :.*Could not convert string "DDD" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table where line=3 and column_idx=0; +FROM csv_rejects_table where line=3 and column_idx=1; ---- :.*Could not convert string "EEE" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table where line=3 and column_idx=2; +FROM csv_rejects_table where line=3 and column_idx=3; ---- :.*Could not convert string "FFF" to 'INTEGER'.* @@ -88,18 +88,18 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -test/sql/copy/csv/data/error/mismatch/bad.csv 2 1 "col1" CAST 4,BBB,9, 9 -test/sql/copy/csv/data/error/mismatch/bad2.csv 3 0 "col0" CAST EEE,7,FFF, 16 +test/sql/copy/csv/data/error/mismatch/bad.csv 2 2 "col1" CAST 4,BBB,9, 9 +test/sql/copy/csv/data/error/mismatch/bad2.csv 3 1 "col0" CAST EEE,7,FFF, 16 query I SELECT error_message -FROM csv_rejects_table where line=2 and column_idx=1; +FROM csv_rejects_table where line=2 and column_idx=2; ---- :.*Could not convert string "BBB" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table where line=3 and 
column_idx=0; +FROM csv_rejects_table where line=3 and column_idx=1; ---- :.*Could not convert string "EEE" to 'INTEGER'.* @@ -144,18 +144,18 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 0 "num" CAST B, A 10875 -test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 0 "num" CAST C, A 20875 +test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 1 "num" CAST B, A 10875 +test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 1 "num" CAST C, A 20875 query I SELECT error_message -FROM csv_rejects_table where line=2176 and column_idx=0; +FROM csv_rejects_table where line=2176 and column_idx=1; ---- :.*Could not convert string "B" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table where line=4176 and column_idx=0; +FROM csv_rejects_table where line=4176 and column_idx=1; ---- :.*Could not convert string "C" to 'INTEGER'.* @@ -175,18 +175,18 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 0 "num" CAST B, A 18395 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 0 "num" CAST C, A 28395 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 1 "num" CAST B, A 18395 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 1 "num" CAST C, A 28395 query I SELECT error_message -FROM csv_rejects_table where line=3680 and column_idx=0; +FROM csv_rejects_table where line=3680 and column_idx=1; ---- :.*Could not convert string "B" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table where line=5680 and column_idx=0; +FROM csv_rejects_table where line=5680 and column_idx=1; ---- :.*Could not convert string "C" to 'INTEGER'.* @@ -207,32 +207,32 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 0 "num" CAST B, A 10875 -test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 0 "num" CAST C, A 20875 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 0 "num" CAST B, A 18395 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 0 "num" CAST C, A 28395 +test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 1 "num" CAST B, A 10875 +test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 1 "num" CAST C, A 20875 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 1 "num" CAST B, A 18395 +test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 1 "num" CAST C, A 28395 query I SELECT error_message -FROM csv_rejects_table where line=3680 and column_idx=0; +FROM csv_rejects_table where line=3680 and column_idx=1; ---- :.*Could not convert string "B" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table where line=5680 and column_idx=0; +FROM csv_rejects_table where line=5680 and column_idx=1; ---- :.*Could not convert string "C" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table where line=2176 and column_idx=0; +FROM csv_rejects_table where line=2176 and column_idx=1; ---- :.*Could not convert string "B" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table where line=4176 and column_idx=0; +FROM csv_rejects_table where line=4176 and column_idx=1; ---- :.*Could not convert string "C" to 'INTEGER'.* @@ -261,37 
+261,37 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table_left; ---- -test/sql/copy/csv/data/error/mismatch/small1.csv 3 0 "num" CAST X,Y 14 -test/sql/copy/csv/data/error/mismatch/small1.csv 6 0 "num" CAST X,Y 26 +test/sql/copy/csv/data/error/mismatch/small1.csv 3 1 "num" CAST X,Y 14 +test/sql/copy/csv/data/error/mismatch/small1.csv 6 1 "num" CAST X,Y 26 query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table_right; ---- -test/sql/copy/csv/data/error/mismatch/small2.csv 3 0 "num" CAST X,Y 14 -test/sql/copy/csv/data/error/mismatch/small2.csv 5 0 "num" CAST X,Y 22 +test/sql/copy/csv/data/error/mismatch/small2.csv 3 1 "num" CAST X,Y 14 +test/sql/copy/csv/data/error/mismatch/small2.csv 5 1 "num" CAST X,Y 22 query I SELECT error_message -FROM csv_rejects_table_left where line=3 and column_idx=0; +FROM csv_rejects_table_left where line=3 and column_idx=1; ---- :.*Could not convert string "X" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table_left where line=6 and column_idx=0; +FROM csv_rejects_table_left where line=6 and column_idx=1; ---- :.*Could not convert string "X" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table_right where line=3 and column_idx=0; +FROM csv_rejects_table_right where line=3 and column_idx=1; ---- :.*Could not convert string "X" to 'INTEGER'.* query I SELECT error_message -FROM csv_rejects_table_right where line=5 and column_idx=0; +FROM csv_rejects_table_right where line=5 and column_idx=1; ---- :.*Could not convert string "X" to 'INTEGER'.* @@ -326,8 +326,8 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table_left; ---- -test/sql/copy/csv/data/error/mismatch/small1.csv 3 0 "num" CAST X,Y 14 -test/sql/copy/csv/data/error/mismatch/small1.csv 6 0 "num" CAST X,Y 26 +test/sql/copy/csv/data/error/mismatch/small1.csv 3 1 "num" CAST X,Y 14 +test/sql/copy/csv/data/error/mismatch/small1.csv 6 1 "num" CAST X,Y 26 query I SELECT COUNT(*) diff --git a/test/sql/copy/csv/rejects/csv_unquoted_rejects.test b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test index 976b66d0ff05..0ce1d845df64 100644 --- a/test/sql/copy/csv/rejects/csv_unquoted_rejects.test +++ b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test @@ -25,7 +25,7 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -data/csv/rejects/unquoted/basic.csv 5 0 "a" UNQUOTED VALUE "blaaaaaaaaaaaaaa"bla,4 28 +data/csv/rejects/unquoted/basic.csv 5 1 "a" UNQUOTED VALUE "blaaaaaaaaaaaaaa"bla,4 28 statement ok DROP TABLE csv_rejects_table; @@ -48,7 +48,7 @@ query IIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, byte_position FROM csv_rejects_table; ---- -data/csv/rejects/unquoted/unquoted_new_line.csv 5 0 "a" UNQUOTED VALUE 28 +data/csv/rejects/unquoted/unquoted_new_line.csv 5 1 "a" UNQUOTED VALUE 28 statement ok DROP TABLE csv_rejects_table; @@ -69,7 +69,7 @@ query IIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, byte_position FROM csv_rejects_table; ---- -data/csv/rejects/unquoted/unquoted_last_value.csv 5 0 "a" UNQUOTED VALUE 31 
+data/csv/rejects/unquoted/unquoted_last_value.csv 5 1 "a" UNQUOTED VALUE 31 statement ok DROP TABLE csv_rejects_table; diff --git a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test index 52ff0ac19823..94c56cc71562 100644 --- a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test +++ b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test @@ -15,7 +15,7 @@ query IIIIIII rowsort SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position FROM csv_rejects_table; ---- -test/sql/copy/csv/data/test/invalid_utf_big.csv 3001 2 "col3" INVALID UNICODE valid,invalid_??_part,valid 54000 -test/sql/copy/csv/data/test/invalid_utf_big.csv 3012 2 "col3" INVALID UNICODE valid,valid,invalid_??_part 54208 -test/sql/copy/csv/data/test/invalid_utf_big.csv 3023 2 "col3" INVALID UNICODE valid,invalid_??_part,valid 54416 -test/sql/copy/csv/data/test/invalid_utf_big.csv 3034 2 "col3" INVALID UNICODE valid,valid,invalid_??_part 54624 +test/sql/copy/csv/data/test/invalid_utf_big.csv 3001 2 "col2" INVALID UNICODE valid,invalid_??_part,valid 54000 +test/sql/copy/csv/data/test/invalid_utf_big.csv 3012 3 "col3" INVALID UNICODE valid,valid,invalid_??_part 54208 +test/sql/copy/csv/data/test/invalid_utf_big.csv 3023 2 "col2" INVALID UNICODE valid,invalid_??_part,valid 54416 +test/sql/copy/csv/data/test/invalid_utf_big.csv 3034 3 "col3" INVALID UNICODE valid,valid,invalid_??_part 54624 \ No newline at end of file From ba1da6a688803300c39610fe03be73b145e86af4 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Fri, 8 Mar 2024 14:38:32 +0100 Subject: [PATCH 034/147] Handle invalid states in overbuffer values --- .../scanner/string_value_scanner.cpp | 6 +++ .../csv_scanner/csv_state_machine.hpp | 4 ++ .../csv/rejects/csv_unquoted_rejects.test | 47 ++++++++++--------- 3 files changed, 36 insertions(+), 21 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 0dea719fb455..ffdf15c5d523 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -926,6 +926,9 @@ void StringValueScanner::ProcessOverbufferValue() { if (states.IsEscaped()) { result.escaped = true; } + if (states.IsInvalid()) { + result.InvalidState(result); + } j++; } if (overbuffer_string.empty() && @@ -955,6 +958,9 @@ void StringValueScanner::ProcessOverbufferValue() { if (states.IsEscaped()) { result.escaped = true; } + if (states.IsInvalid()) { + result.InvalidState(result); + } j++; } string_t value; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp index 49542782f56a..a1628e100f63 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine.hpp @@ -54,6 +54,10 @@ struct CSVStates { return states[1] == CSVState::CARRIAGE_RETURN; } + inline bool IsInvalid() { + return states[1] == CSVState::INVALID; + } + inline bool IsQuoted() { return states[0] == CSVState::QUOTED; } diff --git a/test/sql/copy/csv/rejects/csv_unquoted_rejects.test b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test index 0ce1d845df64..13c13b8b9fa7 100644 --- a/test/sql/copy/csv/rejects/csv_unquoted_rejects.test +++ 
b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test
@@ -74,24 +74,29 @@ data/csv/rejects/unquoted/unquoted_last_value.csv 5 1 "a" UNQUOTED VALUE 31
 statement ok
 DROP TABLE csv_rejects_table;
-# Test buffer sizes (borked :( )
-#
-#loop buffer_size 35 1001
-#
-#statement ok
-#SELECT * FROM read_csv(
-# 'data/csv/rejects/unquoted/basic.csv',
-# columns = {'a': 'VARCHAR', 'b': 'INTEGER'},
-# rejects_table='csv_rejects_table',
-# ignore_errors=true, auto_detect=false, header = 1, quote = '"', escape = '"', buffer_size=${buffer_size});
-#
-#query IIIIIII rowsort
-#SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
-#FROM csv_rejects_table;
-#----
-#data/csv/rejects/unquoted/basic.csv 5 0 "a" UNQUOTED VALUE "blaaaaaaaaaaaaaa"bla,4 28
-#
-#statement ok
-#DROP TABLE csv_rejects_table;
-#
-#endloop
\ No newline at end of file
+loop buffer_size 35 40
+
+query II
+SELECT * FROM read_csv(
+ 'data/csv/rejects/unquoted/basic.csv',
+ columns = {'a': 'VARCHAR', 'b': 'INTEGER'},
+ rejects_table='csv_rejects_table', buffer_size=${buffer_size},
+ ignore_errors=true, auto_detect=false, header = 1, quote = '"', escape = '"');
+----
+bla 1
+bla 2
+bla 3
+bla 1
+bla 2
+bla 3
+
+query IIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
+FROM csv_rejects_table;
+----
+data/csv/rejects/unquoted/basic.csv 5 1 "a" UNQUOTED VALUE "blaaaaaaaaaaaaaa"bla,4 28
+
+statement ok
+DROP TABLE csv_rejects_table;
+
+endloop
\ No newline at end of file
From 9aa5d2a2088fad1da5b337677f777f0513ebc7fe Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Fri, 8 Mar 2024 15:22:22 +0100
Subject: [PATCH 035/147] Add a mixed test with a bunch of different errors
---
 data/csv/rejects/frankstein/nightmare.csv | 48 +++++++++++++
 .../scanner/string_value_scanner.cpp | 1 +
 .../operator/csv_scanner/util/csv_error.cpp | 50 +++++++-------
 .../operator/csv_scanner/csv_error.hpp | 4 +-
 test/sql/copy/csv/rejects/test_mixed.test | 68 +++++++++++++++++++
 5 files changed, 144 insertions(+), 27 deletions(-)
 create mode 100644 data/csv/rejects/frankstein/nightmare.csv
 create mode 100644 test/sql/copy/csv/rejects/test_mixed.test
diff --git a/data/csv/rejects/frankstein/nightmare.csv b/data/csv/rejects/frankstein/nightmare.csv
new file mode 100644
index 000000000000..579f46a359b7
--- /dev/null
+++ b/data/csv/rejects/frankstein/nightmare.csv
@@ -0,0 +1,48 @@
+a,b,c
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro",5
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,bla,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"bla
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro thiago timbo holanda"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedroÿÿ"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
+1,2,"pedro"
diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index ffdf15c5d523..e568a904b19c 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -450,6 +450,7 @@ bool StringValueResult::AddRowInternal() {
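// Descriptive note on the one-line fix below: when a line exceeds maximum_line_size,
// the row has already been counted in number_of_rows, so after the reject is reported
// the counter is rolled back (number_of_rows--) and the oversized line stays out of
// the emitted chunk instead of appearing both as a reject and as a result row.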
CSVError::LineSizeError(state_machine.options, current_line_size, lines_per_batch, borked_line, current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); + number_of_rows--; } if (HandleError()) { return false; diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index fadfad6716ff..cfab97c9e55d 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -19,7 +19,11 @@ void CSVErrorHandler::ThrowError(CSVError csv_error) { if (PrintLineNumber(csv_error)) { error << "CSV Error on Line: " << GetLine(csv_error.error_info) << std::endl; } - error << csv_error.error_message; + if (csv_error.error_message_with_options.empty()){ + error << csv_error.error_message; + } else{ + error << csv_error.error_message_with_options; + } switch (csv_error.type) { case CSVErrorType::CAST_ERROR: throw ConversionException(error.str()); @@ -78,9 +82,15 @@ CSVError::CSVError(string error_message_p, CSVErrorType type_p, LinesPerBoundary } CSVError::CSVError(string error_message_p, CSVErrorType type_p, idx_t column_idx_p, string csv_row_p, - LinesPerBoundary error_info_p, idx_t byte_position_p) + LinesPerBoundary error_info_p, idx_t byte_position_p, const CSVReaderOptions &reader_options) : error_message(std::move(error_message_p)), type(type_p), column_idx(column_idx_p), csv_row(std::move(csv_row_p)), error_info(error_info_p), byte_position(byte_position_p) { + // What were the options + std::ostringstream error; + error << error_message << std::endl; + error << reader_options.ToString(); + error << std::endl; + error_message_with_options = error.str(); } CSVError CSVError::ColumnTypesError(case_insensitive_map_t sql_types_per_column, const vector &names) { @@ -107,22 +117,18 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_nam string &csv_row, LinesPerBoundary error_info, idx_t byte_position) { std::ostringstream error; // Which column - error << "Error when converting column \"" << column_name << "\"." << std::endl; + error << "Error when converting column \"" << column_name << "\". "; // What was the cast error - error << cast_error << std::endl; - error << std::endl; - // What were the options - error << options.ToString(); - return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info, byte_position); + error << cast_error; + return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info, byte_position, options); } CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info, string &csv_row, idx_t byte_position) { std::ostringstream error; error << "Maximum line size of " << options.maximum_line_size << " bytes exceeded. "; - error << "Actual Size:" << actual_size << " bytes." 
<< std::endl; - error << options.ToString(); - return CSVError(error.str(), CSVErrorType::MAXIMUM_LINE_SIZE, 0, csv_row, error_info, byte_position); + error << "Actual Size:" << actual_size << " bytes."; + return CSVError(error.str(), CSVErrorType::MAXIMUM_LINE_SIZE, 0, csv_row, error_info, byte_position, options); } CSVError CSVError::SniffingError(string &file_path) { @@ -146,26 +152,20 @@ CSVError CSVError::NullPaddingFail(const CSVReaderOptions &options, LinesPerBoun CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, idx_t current_column, LinesPerBoundary error_info, string &csv_row, idx_t byte_position) { std::ostringstream error; - error << "Value with unterminated quote found." << std::endl; - error << std::endl; - // What were the options - error << options.ToString(); - return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info, byte_position); + error << "Value with unterminated quote found."; + return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info, byte_position, options); } CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, idx_t actual_columns, LinesPerBoundary error_info, string &csv_row, idx_t byte_position) { std::ostringstream error; // How many columns were expected and how many were found - error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1 - << std::endl; - // What were the options - error << options.ToString(); + error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1; if (actual_columns >= options.dialect_options.num_cols) { return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, - byte_position); + byte_position, options); } else { - return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, byte_position); + return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, byte_position, options); } } @@ -173,10 +173,8 @@ CSVError CSVError::InvalidUTF8(const CSVReaderOptions &options, idx_t current_co string &csv_row, idx_t byte_position) { std::ostringstream error; // How many columns were expected and how many were found - error << "Invalid unicode (byte sequence mismatch) detected." << std::endl; - // What were the options - error << options.ToString(); - return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, current_column, csv_row, error_info, byte_position); + error << "Invalid unicode (byte sequence mismatch) detected."; + return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, current_column, csv_row, error_info, byte_position, options); } bool CSVErrorHandler::PrintLineNumber(CSVError &error) { diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp index befc3a669219..98f460127d83 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp @@ -52,7 +52,7 @@ class CSVError { public: CSVError() {}; CSVError(string error_message, CSVErrorType type, idx_t column_idx, string csv_row, LinesPerBoundary error_info, - idx_t byte_position); + idx_t byte_position, const CSVReaderOptions &reader_options); CSVError(string error_message, CSVErrorType type, LinesPerBoundary error_info); //! 
Produces error messages for column name -> type mismatch.
 static CSVError ColumnTypesError(case_insensitive_map_t sql_types_per_column, const vector &names);
@@ -81,6 +81,8 @@ class CSVError {
 //! Actual error message
 string error_message;
+ //! Error message with the reader options appended
+ string error_message_with_options;
 //! Error Type
 CSVErrorType type;
 //! Column Index where error happened
diff --git a/test/sql/copy/csv/rejects/test_mixed.test b/test/sql/copy/csv/rejects/test_mixed.test
new file mode 100644
index 000000000000..45001a5e4b05
--- /dev/null
+++ b/test/sql/copy/csv/rejects/test_mixed.test
@@ -0,0 +1,68 @@
+# name: test/sql/copy/csv/rejects/test_mixed.test
+# description: Tests a mix of all possible CSV Errors
+# group: [rejects]
+
+require skip_reload
+
+# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n
+require notwindows
+
+query III
+SELECT * FROM read_csv(
+ 'data/csv/rejects/frankstein/nightmare.csv',
+ columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'VARCHAR'},
+ rejects_table='csv_rejects_table',
+ ignore_errors=true, auto_detect=false, header = 1, max_line_size=20);
+----
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+1 2 pedro
+
+query IIIIIIII rowsort
+SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position, error_message
+FROM csv_rejects_table;
+----
+data/csv/rejects/frankstein/nightmare.csv 10 2 "c" MISSING COLUMNS 1,2 102 Expected Number of Columns: 3 Found: 2
+data/csv/rejects/frankstein/nightmare.csv 14 4 NULL TOO MANY COLUMNS 1,2,"pedro",5 142 Expected Number of Columns: 3 Found: 4
+data/csv/rejects/frankstein/nightmare.csv 19 2 "b" CAST 1,bla,"pedro" 204 Error when converting column "b". Could not convert string "bla" to 'INTEGER'
+data/csv/rejects/frankstein/nightmare.csv 22 3 "c" UNQUOTED VALUE 1,2,"pedro"bla 242 Value with unterminated quote found.
+data/csv/rejects/frankstein/nightmare.csv 32 1 "a" LINE SIZE OVER MAXIMUM 1,2,"pedro thiago timbo holanda" 365 Maximum line size of 20 bytes exceeded. Actual Size:33 bytes.
+data/csv/rejects/frankstein/nightmare.csv 38 3 "c" INVALID UNICODE 1,2,"pedro??" 458 Invalid unicode (byte sequence mismatch) detected.
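+# Note on the expected rejects above: each row exercises a distinct error_type;
+# column_idx is reported 1-based (see PATCH 033), and column_name is NULL for the
+# TOO MANY COLUMNS case because the surplus value has no matching column in the schema.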
\ No newline at end of file From 65cf2d4c9d94fab39e5cd3b791390a01964471a1 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 11 Mar 2024 14:56:14 +0100 Subject: [PATCH 036/147] [wip] changing to store rejects into two tables, add store_rejects flag --- .../scanner/string_value_scanner.cpp | 14 ++-- .../csv_scanner/sniffer/csv_sniffer.cpp | 4 +- .../csv_scanner/sniffer/dialect_detection.cpp | 12 +-- .../csv_scanner/sniffer/type_detection.cpp | 2 +- .../table_function/csv_file_scanner.cpp | 8 +- .../table_function/global_csv_state.cpp | 5 +- .../operator/csv_scanner/util/csv_error.cpp | 17 ++-- .../csv_scanner/util/csv_reader_options.cpp | 13 +-- .../operator/persistent/csv_rejects_table.cpp | 84 +++++++++++++------ src/function/table/read_csv.cpp | 20 ++--- .../operator/csv_scanner/csv_option.hpp | 2 +- .../csv_scanner/csv_reader_options.hpp | 6 +- .../operator/persistent/csv_rejects_table.hpp | 6 +- .../duckdb/storage/serialization/nodes.json | 6 +- src/storage/serialization/serialize_nodes.cpp | 8 +- .../csv/rejects/csv_rejects_double_table.test | 0 16 files changed, 119 insertions(+), 88 deletions(-) create mode 100644 test/sql/copy/csv/rejects/csv_rejects_double_table.test diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index e568a904b19c..ae9d470a32ce 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -17,7 +17,7 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m shared_ptr csv_file_scan_p, idx_t &lines_read_p, bool sniffing_p) : ScannerResult(states, state_machine), number_of_columns(NumericCast(state_machine.dialect_options.num_cols)), - null_padding(state_machine.options.null_padding), ignore_errors(state_machine.options.ignore_errors), + null_padding(state_machine.options.null_padding), ignore_errors(state_machine.options.ignore_errors.GetValue()), null_str_ptr(state_machine.options.null_str.c_str()), null_str_size(state_machine.options.null_str.size()), result_size(result_size_p), error_handler(error_hander_p), iterator(iterator_p), store_line_size(store_line_size_p), csv_file_scan(std::move(csv_file_scan_p)), lines_read(lines_read_p), @@ -219,7 +219,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size // By default we add a string // We only evaluate if a string is utf8 valid, if it's actually a varchar if (parse_types[chunk_col_id].second && !Utf8Proc::IsValid(value_ptr, UnsafeNumericCast(size))) { - bool force_error = !state_machine.options.ignore_errors && sniffing; + bool force_error = !state_machine.options.ignore_errors.GetValue() && sniffing; // Invalid unicode, we must error if (force_error) { HandleUnicodeError(cur_col_id, force_error); @@ -554,7 +554,7 @@ bool StringValueResult::AddRow(StringValueResult &result, const idx_t buffer_pos } void StringValueResult::InvalidState(StringValueResult &result) { - bool force_error = !result.state_machine.options.ignore_errors && result.sniffing; + bool force_error = !result.state_machine.options.ignore_errors.GetValue() && result.sniffing; // Invalid unicode, we must error if (force_error) { result.HandleUnicodeError(result.cur_col_id, force_error); @@ -721,7 +721,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { { vector row; - if (state_machine->options.ignore_errors) { + if (state_machine->options.ignore_errors.GetValue()) { for (idx_t col 
= 0; col < parse_chunk.ColumnCount(); col++) { row.push_back(parse_chunk.GetValue(col, line_error)); } @@ -739,7 +739,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { error_handler->Error(csv_error); } borked_lines.insert(line_error++); - D_ASSERT(state_machine->options.ignore_errors); + D_ASSERT(state_machine->options.ignore_errors.GetValue()); // We are ignoring errors. We must continue but ignoring borked rows for (; line_error < parse_chunk.size(); line_error++) { if (!inserted_column_data.validity.RowIsValid(line_error) && @@ -1184,7 +1184,7 @@ void StringValueScanner::SetStart() { if (iterator.pos.buffer_pos == cur_buffer_handle->actual_size || scan_finder->iterator.GetBufferIdx() >= iterator.GetBufferIdx()) { // Propagate any errors - if (!scan_finder->error_handler->errors.empty() && state_machine->options.ignore_errors) { + if (!scan_finder->error_handler->errors.empty() && state_machine->options.ignore_errors.GetValue()) { for (auto &error_vector : scan_finder->error_handler->errors) { for (auto &error : error_vector.second) { error_handler->Error(error); @@ -1202,7 +1202,7 @@ void StringValueScanner::SetStart() { } } while (!line_found); // Propagate any errors - if (!scan_finder->error_handler->errors.empty() && state_machine->options.ignore_errors) { + if (!scan_finder->error_handler->errors.empty() && state_machine->options.ignore_errors.GetValue()) { for (auto &error_vector : scan_finder->error_handler->errors) { for (auto &error : error_vector.second) { error_handler->Error(error); diff --git a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index 9009210359f1..238b56426b52 100644 --- a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -13,7 +13,7 @@ CSVSniffer::CSVSniffer(CSVReaderOptions &options_p, shared_ptr } // Initialize max columns found to either 0 or however many were set max_columns_found = set_columns.Size(); - error_handler = make_shared(options.ignore_errors); + error_handler = make_shared(options.ignore_errors.GetValue()); detection_error_handler = make_shared(true); } @@ -93,7 +93,7 @@ SnifferResult CSVSniffer::SniffCSV(bool force_match) { DetectHeader(); // 5. 
Type Replacement ReplaceTypes(); - if (!best_candidate->error_handler->errors.empty() && !options.ignore_errors) { + if (!best_candidate->error_handler->errors.empty() && !options.ignore_errors.GetValue()) { for (auto &error_vector : best_candidate->error_handler->errors) { for (auto &error : error_vector.second) { if (error.type == CSVErrorType::MAXIMUM_LINE_SIZE) { diff --git a/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp b/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp index 7e23a6d1cc4b..0f5a485adff3 100644 --- a/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +++ b/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp @@ -101,18 +101,19 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr scanner, if (sniffed_column_counts.result_position > rows_read) { rows_read = sniffed_column_counts.result_position; } - if (set_columns.IsCandidateUnacceptable(num_cols, options.null_padding, options.ignore_errors, + if (set_columns.IsCandidateUnacceptable(num_cols, options.null_padding, options.ignore_errors.GetValue(), sniffed_column_counts.last_value_always_empty)) { // Not acceptable return; } for (idx_t row = start_row; row < sniffed_column_counts.result_position; row++) { - if (set_columns.IsCandidateUnacceptable(sniffed_column_counts[row], options.null_padding, options.ignore_errors, + if (set_columns.IsCandidateUnacceptable(sniffed_column_counts[row], options.null_padding, + options.ignore_errors.GetValue(), sniffed_column_counts.last_value_always_empty)) { // Not acceptable return; } - if (sniffed_column_counts[row] == num_cols || options.ignore_errors) { + if (sniffed_column_counts[row] == num_cols || options.ignore_errors.GetValue()) { consistent_rows++; } else if (num_cols < sniffed_column_counts[row] && !options.dialect_options.skip_rows.IsSetByUser() && (!set_columns.IsSet() || options.null_padding)) { @@ -212,10 +213,11 @@ bool CSVSniffer::RefineCandidateNextChunk(ColumnCountScanner &candidate) { for (idx_t i = 0; i < sniffed_column_counts.result_position; i++) { if (set_columns.IsSet()) { return !set_columns.IsCandidateUnacceptable(sniffed_column_counts[i], options.null_padding, - options.ignore_errors, + options.ignore_errors.GetValue(), sniffed_column_counts.last_value_always_empty); } else { - if (max_columns_found != sniffed_column_counts[i] && (!options.null_padding && !options.ignore_errors)) { + if (max_columns_found != sniffed_column_counts[i] && + (!options.null_padding && !options.ignore_errors.GetValue())) { return false; } } diff --git a/src/execution/operator/csv_scanner/sniffer/type_detection.cpp b/src/execution/operator/csv_scanner/sniffer/type_detection.cpp index 717472b3c211..fe1bf8644776 100644 --- a/src/execution/operator/csv_scanner/sniffer/type_detection.cpp +++ b/src/execution/operator/csv_scanner/sniffer/type_detection.cpp @@ -273,7 +273,7 @@ void CSVSniffer::DetectTypes() { // it's good if the dialect creates more non-varchar columns, but only if we sacrifice < 30% of // best_num_cols. 
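// Worked example of the guard below (illustrative numbers, not from the patch): with
// max_columns_found = 10, info_sql_types_candidates.size() > (max_columns_found * 0.7)
// only holds when a candidate keeps more than 7 columns, i.e. at least 8 of the 10,
// so at most roughly 30% of the columns may be traded away for non-varchar types.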
if (varchar_cols < min_varchar_cols && info_sql_types_candidates.size() > (max_columns_found * 0.7) && - (!options.ignore_errors || candidate->error_handler->errors.size() < min_errors)) { + (!options.ignore_errors.GetValue() || candidate->error_handler->errors.size() < min_errors)) { min_errors = candidate->error_handler->errors.size(); best_header_row.clear(); // we have a new best_options candidate diff --git a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp index 641cbdd06818..807fe700a402 100644 --- a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +++ b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp @@ -10,7 +10,7 @@ CSVFileScan::CSVFileScan(ClientContext &context, shared_ptr bu vector &file_schema) : file_path(options_p.file_path), file_idx(0), buffer_manager(std::move(buffer_manager_p)), state_machine(std::move(state_machine_p)), file_size(buffer_manager->file_handle->FileSize()), - error_handler(make_shared(options_p.ignore_errors)), + error_handler(make_shared(options_p.ignore_errors.GetValue())), on_disk_file(buffer_manager->file_handle->OnDiskFile()), options(options_p) { if (bind_data.initial_reader.get()) { auto &union_reader = *bind_data.initial_reader; @@ -43,7 +43,7 @@ CSVFileScan::CSVFileScan(ClientContext &context, const string &file_path_p, cons const idx_t file_idx_p, const ReadCSVData &bind_data, const vector &column_ids, const vector &file_schema) : file_path(file_path_p), file_idx(file_idx_p), - error_handler(make_shared(options_p.ignore_errors)), options(options_p) { + error_handler(make_shared(options_p.ignore_errors.GetValue())), options(options_p) { if (file_idx < bind_data.union_readers.size()) { // we are doing UNION BY NAME - fetch the options from the union reader for this file optional_ptr union_reader_ptr; @@ -129,8 +129,8 @@ CSVFileScan::CSVFileScan(ClientContext &context, const string &file_path_p, cons } CSVFileScan::CSVFileScan(ClientContext &context, const string &file_name, CSVReaderOptions &options_p) - : file_path(file_name), file_idx(0), error_handler(make_shared(options_p.ignore_errors)), - options(options_p) { + : file_path(file_name), file_idx(0), + error_handler(make_shared(options_p.ignore_errors.GetValue())), options(options_p) { buffer_manager = make_shared(context, options, file_path, file_idx); // Initialize On Disk and Size of file on_disk_file = buffer_manager->file_handle->OnDiskFile(); diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 1d012349da21..4446a670e5a5 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -173,10 +173,9 @@ string CSVErrorTypeToEnum(CSVErrorType type) { void CSVGlobalState::FillRejectsTable() { auto &options = bind_data.options; - if (!options.rejects_table_name.empty()) { + if (options.store_rejects) { auto limit = options.rejects_limit; - - auto rejects = CSVRejectsTable::GetOrCreate(context, options.rejects_table_name); + auto rejects = CSVRejectsTable::GetOrCreate(context); lock_guard lock(rejects->write_lock); auto &table = rejects->GetTable(context); InternalAppender appender(context, table); diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index cfab97c9e55d..7a8349288c07 
100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -19,9 +19,9 @@ void CSVErrorHandler::ThrowError(CSVError csv_error) { if (PrintLineNumber(csv_error)) { error << "CSV Error on Line: " << GetLine(csv_error.error_info) << std::endl; } - if (csv_error.error_message_with_options.empty()){ + if (csv_error.error_message_with_options.empty()) { error << csv_error.error_message; - } else{ + } else { error << csv_error.error_message_with_options; } switch (csv_error.type) { @@ -153,7 +153,8 @@ CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, idx_ LinesPerBoundary error_info, string &csv_row, idx_t byte_position) { std::ostringstream error; error << "Value with unterminated quote found."; - return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info, byte_position, options); + return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info, byte_position, + options); } CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, idx_t actual_columns, @@ -162,10 +163,11 @@ CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, i // How many columns were expected and how many were found error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1; if (actual_columns >= options.dialect_options.num_cols) { - return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, - byte_position, options); + return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, byte_position, + options); } else { - return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, byte_position, options); + return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, byte_position, + options); } } @@ -174,7 +176,8 @@ CSVError CSVError::InvalidUTF8(const CSVReaderOptions &options, idx_t current_co std::ostringstream error; // How many columns were expected and how many were found error << "Invalid unicode (byte sequence mismatch) detected."; - return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, current_column, csv_row, error_info, byte_position, options); + return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, current_column, csv_row, error_info, byte_position, + options); } bool CSVErrorHandler::PrintLineNumber(CSVError &error) { diff --git a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp index 72c73a2e5bac..849c0e97ec52 100644 --- a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +++ b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp @@ -189,7 +189,7 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, string format = ParseString(value, loption); SetDateFormat(LogicalTypeId::TIMESTAMP, format, true); } else if (loption == "ignore_errors") { - ignore_errors = ParseBoolean(value, loption); + ignore_errors.Set(ParseBoolean(value, loption)); } else if (loption == "buffer_size") { buffer_size = ParseInteger(value, loption); if (buffer_size == 0) { @@ -206,13 +206,8 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, parallel = ParseBoolean(value, loption); } else if (loption == "allow_quoted_nulls") { allow_quoted_nulls = 
ParseBoolean(value, loption); - } else if (loption == "rejects_table") { - // skip, handled in SetRejectsOptions - auto table_name = ParseString(value, loption); - if (table_name.empty()) { - throw BinderException("REJECTS_TABLE option cannot be empty"); - } - rejects_table_name = table_name; + } else if (loption == "store_rejects") { + store_rejects = ParseBoolean(value, loption); } else if (loption == "rejects_limit") { int64_t limit = ParseInteger(value, loption); if (limit < 0) { @@ -323,7 +318,7 @@ string CSVReaderOptions::ToString() const { // sample_size error += "sample_size=" + std::to_string(sample_size_chunks * STANDARD_VECTOR_SIZE) + "\n "; // ignore_errors - error += "ignore_errors=" + std::to_string(ignore_errors) + "\n "; + error += "ignore_errors=" + ignore_errors.FormatValue() + "\n "; // all_varchar error += "all_varchar=" + std::to_string(all_varchar) + "\n"; diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index a2f80d855d15..939ce125f39a 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -9,14 +9,14 @@ namespace duckdb { TableCatalogEntry &CSVRejectsTable::GetTable(ClientContext &context) { auto &temp_catalog = Catalog::GetCatalog(context, TEMP_CATALOG); - auto &table_entry = temp_catalog.GetEntry(context, TEMP_CATALOG, DEFAULT_SCHEMA, name); + auto &table_entry = temp_catalog.GetEntry(context, TEMP_CATALOG, DEFAULT_SCHEMA, "reject_scans"); return table_entry; } -shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, const string &name) { - auto key = "CSV_REJECTS_TABLE_CACHE_ENTRY_" + StringUtil::Upper(name); +shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context) { + auto key = "CSV_REJECTS_TABLE_CACHE_ENTRY"; auto &cache = ObjectCache::GetObjectCache(context); - return cache.GetOrCreate(key, name); + return cache.GetOrCreate(key); } void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData &data) { @@ -38,27 +38,61 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData type_info->on_conflict = OnCreateConflict::IGNORE_ON_CONFLICT; catalog.CreateType(context, *type_info); - // Create Rejects Table - auto info = make_uniq(TEMP_CATALOG, DEFAULT_SCHEMA, name); - info->temporary = true; - info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT; - // 1. File Path - info->columns.AddColumn(ColumnDefinition("file", LogicalType::VARCHAR)); - // 2. Row Line - info->columns.AddColumn(ColumnDefinition("line", LogicalType::UBIGINT)); - // 3. Byte Position where error occurred - info->columns.AddColumn(ColumnDefinition("byte_position", LogicalType::UBIGINT)); - // 4. Column Index (If Applicable) - info->columns.AddColumn(ColumnDefinition("column_idx", LogicalType::UBIGINT)); - // 5. Column Name (If Applicable) - info->columns.AddColumn(ColumnDefinition("column_name", LogicalType::VARCHAR)); - // 6. Error Type - info->columns.AddColumn(ColumnDefinition("error_type", enum_type)); - // 7. Original CSV Line - info->columns.AddColumn(ColumnDefinition("csv_line", LogicalType::VARCHAR)); - // 8. Full Error Message - info->columns.AddColumn(ColumnDefinition("error_message", LogicalType::VARCHAR)); - catalog.CreateTable(context, std::move(info)); + // Create Rejects Scans Table + { + auto info = make_uniq(TEMP_CATALOG, DEFAULT_SCHEMA, "reject_scans"); + info->temporary = true; + info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT; + // 0. 
Scan ID
+		info->columns.AddColumn(ColumnDefinition("scan_id", LogicalType::UBIGINT));
+		// 1. File Path
+		info->columns.AddColumn(ColumnDefinition("file_path", LogicalType::VARCHAR));
+		// 2. Delimiter
+		info->columns.AddColumn(ColumnDefinition("delimiter", LogicalType::VARCHAR));
+		// 3. Quote
+		info->columns.AddColumn(ColumnDefinition("quote", LogicalType::VARCHAR));
+		// 4. Escape
+		info->columns.AddColumn(ColumnDefinition("escape", LogicalType::VARCHAR));
+		// 5. NewLine Delimiter
+		info->columns.AddColumn(ColumnDefinition("newline_delimiter", LogicalType::VARCHAR));
+		// 6. Skip Rows
+		info->columns.AddColumn(ColumnDefinition("skip_rows", LogicalType::UINTEGER));
+		// 7. Has Header
+		info->columns.AddColumn(ColumnDefinition("has_header", LogicalType::BOOLEAN));
+		// 8. List<Struct<name, type>>
+		info->columns.AddColumn(ColumnDefinition("columns", LogicalType::VARCHAR));
+		// 9. Date Format
+		info->columns.AddColumn(ColumnDefinition("date_format", LogicalType::VARCHAR));
+		// 10. Timestamp Format
+		info->columns.AddColumn(ColumnDefinition("timestamp_format", LogicalType::VARCHAR));
+		// 11. CSV read function with all the options used
+		info->columns.AddColumn(ColumnDefinition("user_arguments", LogicalType::VARCHAR));
+		// 12. CSV read function with all the options used
+		info->columns.AddColumn(ColumnDefinition("prompt", LogicalType::VARCHAR));
+		catalog.CreateTable(context, std::move(info));
+	}
+	{
+		// Create Rejects Error Table
+		auto info = make_uniq<CreateTableInfo>(TEMP_CATALOG, DEFAULT_SCHEMA, "reject_errors");
+		info->temporary = true;
+		info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT;
+		// 1. Row Line
+		info->columns.AddColumn(ColumnDefinition("line", LogicalType::UBIGINT));
+		// 2. Byte Position where error occurred
+		info->columns.AddColumn(ColumnDefinition("byte_position", LogicalType::UBIGINT));
+		// 3. Column Index (If Applicable)
+		info->columns.AddColumn(ColumnDefinition("column_idx", LogicalType::UBIGINT));
+		// 4. Column Name (If Applicable)
+		info->columns.AddColumn(ColumnDefinition("column_name", LogicalType::VARCHAR));
+		// 5. Error Type
+		info->columns.AddColumn(ColumnDefinition("error_type", enum_type));
+		// 6. Original CSV Line
+		info->columns.AddColumn(ColumnDefinition("csv_line", LogicalType::VARCHAR));
+		// 7. 
Full Error Message + info->columns.AddColumn(ColumnDefinition("error_message", LogicalType::VARCHAR)); + catalog.CreateTable(context, std::move(info)); + } + count = 0; } diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 1b5d33f4df5b..2c691aa8bd21 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -53,19 +53,18 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio options.FromNamedParameters(input.named_parameters, context, return_types, names); // Validate rejects_table options - if (!options.rejects_table_name.empty()) { - if (!options.ignore_errors) { + if (options.store_rejects) { + if (!options.ignore_errors.GetValue() && options.ignore_errors.IsSetByUser()) { throw BinderException("REJECTS_TABLE option is only supported when IGNORE_ERRORS is set to true"); } + // Ensure we set ignore errors to true automagically + options.ignore_errors.Set(true, false); if (options.file_options.union_by_name) { throw BinderException("REJECTS_TABLE option is not supported when UNION_BY_NAME is set to true"); } } - - if (options.rejects_limit != 0) { - if (options.rejects_table_name.empty()) { - throw BinderException("REJECTS_LIMIT option is only supported when REJECTS_TABLE is set to a table name"); - } + if (options.rejects_limit != 0 && !options.store_rejects) { + throw BinderException("REJECTS_LIMIT option is only supported when REJECTS_TABLE is set to a table name"); } options.file_options.AutoDetectHivePartitioning(result->files, context); @@ -146,9 +145,8 @@ static unique_ptr ReadCSVInitGlobal(ClientContext &con auto &bind_data = input.bind_data->Cast(); // Create the temporary rejects table - auto rejects_table = bind_data.options.rejects_table_name; - if (!rejects_table.empty()) { - CSVRejectsTable::GetOrCreate(context, rejects_table)->InitializeTable(context, bind_data); + if (bind_data.options.store_rejects) { + CSVRejectsTable::GetOrCreate(context)->InitializeTable(context, bind_data); } if (bind_data.files.empty()) { // This can happen when a filename based filter pushdown has eliminated all possible files for this scan. @@ -228,7 +226,7 @@ void ReadCSVTableFunction::ReadCSVAddNamedParameters(TableFunction &table_functi table_function.named_parameters["max_line_size"] = LogicalType::VARCHAR; table_function.named_parameters["maximum_line_size"] = LogicalType::VARCHAR; table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN; - table_function.named_parameters["rejects_table"] = LogicalType::VARCHAR; + table_function.named_parameters["store_rejects"] = LogicalType::BOOLEAN; table_function.named_parameters["rejects_limit"] = LogicalType::BIGINT; table_function.named_parameters["buffer_size"] = LogicalType::UBIGINT; table_function.named_parameters["decimal_separator"] = LogicalType::VARCHAR; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_option.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_option.hpp index 8c13e2c9f15f..57386f857963 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_option.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_option.hpp @@ -73,7 +73,7 @@ struct CSVOption { return value != other; } //! 
Returns CSV Option value - const T GetValue() const { + inline const T GetValue() const { return value; } bool IsSetByUser() const { diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp index ee06436ed9d6..436802909c82 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp @@ -40,9 +40,9 @@ struct CSVReaderOptions { //! See struct above. DialectOptions dialect_options; //! Whether or not we should ignore InvalidInput errors - bool ignore_errors = false; - //! Rejects table name - string rejects_table_name; + CSVOption ignore_errors = false; + //! Whether we store CSV Errors or not + bool store_rejects = false; //! Rejects table entry limit (0 = no limit) idx_t rejects_limit = 0; //! Number of samples to buffer diff --git a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp index 12c9bc61345e..bb4ff62fa4ae 100644 --- a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp +++ b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp @@ -14,14 +14,14 @@ class ClientContext; class CSVRejectsTable : public ObjectCacheEntry { public: - CSVRejectsTable(string name) : name(name), count(0) { + CSVRejectsTable() : count(0) { } ~CSVRejectsTable() override = default; mutex write_lock; - string name; + idx_t count; - static shared_ptr GetOrCreate(ClientContext &context, const string &name); + static shared_ptr GetOrCreate(ClientContext &context); void InitializeTable(ClientContext &context, const ReadCSVData &options); TableCatalogEntry &GetTable(ClientContext &context); diff --git a/src/include/duckdb/storage/serialization/nodes.json b/src/include/duckdb/storage/serialization/nodes.json index c601768a93cd..6dfb0b003f15 100644 --- a/src/include/duckdb/storage/serialization/nodes.json +++ b/src/include/duckdb/storage/serialization/nodes.json @@ -537,7 +537,7 @@ "members": [ {"id": 100, "name": "ignore_errors", - "type": "bool" + "type": "CSVOption" }, {"id": 101, "name": "buffer_sample_size", @@ -604,8 +604,8 @@ "type": "vector" }, {"id": 117, - "name": "rejects_table_name", - "type": "string" + "name": "store_rejects", + "type": "bool" }, {"id": 118, "name": "rejects_limit", diff --git a/src/storage/serialization/serialize_nodes.cpp b/src/storage/serialization/serialize_nodes.cpp index 96b233b8d6ca..c274e2a2b2ac 100644 --- a/src/storage/serialization/serialize_nodes.cpp +++ b/src/storage/serialization/serialize_nodes.cpp @@ -118,7 +118,7 @@ CSVOption CSVOption::Deserialize(Deserializer &deserializer) { } void CSVReaderOptions::Serialize(Serializer &serializer) const { - serializer.WritePropertyWithDefault(100, "ignore_errors", ignore_errors); + serializer.WriteProperty>(100, "ignore_errors", ignore_errors); serializer.WritePropertyWithDefault(101, "buffer_sample_size", buffer_sample_size); serializer.WritePropertyWithDefault(102, "null_str", null_str); serializer.WriteProperty(103, "compression", compression); @@ -135,7 +135,7 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault(114, "buffer_size", buffer_size); serializer.WriteProperty(115, "file_options", file_options); serializer.WritePropertyWithDefault>(116, "force_quote", force_quote); - serializer.WritePropertyWithDefault(117, "rejects_table_name", 
rejects_table_name); + serializer.WritePropertyWithDefault(117, "store_rejects", store_rejects); serializer.WritePropertyWithDefault(118, "rejects_limit", rejects_limit); serializer.WriteProperty>(119, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter); serializer.WriteProperty>(120, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote); @@ -151,7 +151,7 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { CSVReaderOptions result; - deserializer.ReadPropertyWithDefault(100, "ignore_errors", result.ignore_errors); + deserializer.ReadProperty>(100, "ignore_errors", result.ignore_errors); deserializer.ReadPropertyWithDefault(101, "buffer_sample_size", result.buffer_sample_size); deserializer.ReadPropertyWithDefault(102, "null_str", result.null_str); deserializer.ReadProperty(103, "compression", result.compression); @@ -168,7 +168,7 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { deserializer.ReadPropertyWithDefault(114, "buffer_size", result.buffer_size); deserializer.ReadProperty(115, "file_options", result.file_options); deserializer.ReadPropertyWithDefault>(116, "force_quote", result.force_quote); - deserializer.ReadPropertyWithDefault(117, "rejects_table_name", result.rejects_table_name); + deserializer.ReadPropertyWithDefault(117, "store_rejects", result.store_rejects); deserializer.ReadPropertyWithDefault(118, "rejects_limit", result.rejects_limit); deserializer.ReadProperty>(119, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter); deserializer.ReadProperty>(120, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote); diff --git a/test/sql/copy/csv/rejects/csv_rejects_double_table.test b/test/sql/copy/csv/rejects/csv_rejects_double_table.test new file mode 100644 index 000000000000..e69de29bb2d1 From 05bb2db09a4f309d4882335f77271e8cc3afc255 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 11 Mar 2024 15:32:13 +0100 Subject: [PATCH 037/147] More on making options magically work --- .../table_function/global_csv_state.cpp | 4 ++-- .../csv_scanner/util/csv_reader_options.cpp | 7 +++++++ .../operator/persistent/csv_rejects_table.cpp | 12 +++++------ src/function/table/read_csv.cpp | 21 +++++++++++++------ .../csv_scanner/csv_reader_options.hpp | 6 ++++-- .../operator/persistent/csv_rejects_table.hpp | 15 ++++++++++--- .../duckdb/storage/serialization/nodes.json | 6 +++++- src/storage/serialization/serialize_nodes.cpp | 6 ++++-- 8 files changed, 55 insertions(+), 22 deletions(-) diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 4446a670e5a5..00ba18b82f00 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -173,9 +173,9 @@ string CSVErrorTypeToEnum(CSVErrorType type) { void CSVGlobalState::FillRejectsTable() { auto &options = bind_data.options; - if (options.store_rejects) { + if (options.store_rejects.GetValue()) { auto limit = options.rejects_limit; - auto rejects = CSVRejectsTable::GetOrCreate(context); + auto rejects = CSVRejectsTable::GetOrCreate(context, options.rejects_table_name); lock_guard lock(rejects->write_lock); auto &table = 
rejects->GetTable(context); InternalAppender appender(context, table); diff --git a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp index 849c0e97ec52..7fcb7f3383b1 100644 --- a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +++ b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp @@ -208,6 +208,13 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, allow_quoted_nulls = ParseBoolean(value, loption); } else if (loption == "store_rejects") { store_rejects = ParseBoolean(value, loption); + } else if (loption == "rejects_table") { + // skip, handled in SetRejectsOptions + auto table_name = ParseString(value, loption); + if (table_name.empty()) { + throw BinderException("REJECTS_TABLE option cannot be empty"); + } + rejects_table_name = table_name; } else if (loption == "rejects_limit") { int64_t limit = ParseInteger(value, loption); if (limit < 0) { diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 939ce125f39a..7d8094659377 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -9,14 +9,14 @@ namespace duckdb { TableCatalogEntry &CSVRejectsTable::GetTable(ClientContext &context) { auto &temp_catalog = Catalog::GetCatalog(context, TEMP_CATALOG); - auto &table_entry = temp_catalog.GetEntry(context, TEMP_CATALOG, DEFAULT_SCHEMA, "reject_scans"); + auto &table_entry = temp_catalog.GetEntry(context, TEMP_CATALOG, DEFAULT_SCHEMA, errors_table); return table_entry; } -shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context) { - auto key = "CSV_REJECTS_TABLE_CACHE_ENTRY"; +shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, const string &name) { + auto key = "CSV_REJECTS_TABLE_CACHE_ENTRY_" + StringUtil::Upper(name); auto &cache = ObjectCache::GetObjectCache(context); - return cache.GetOrCreate(key); + return cache.GetOrCreate(key, name); } void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData &data) { @@ -40,7 +40,7 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData // Create Rejects Scans Table { - auto info = make_uniq(TEMP_CATALOG, DEFAULT_SCHEMA, "reject_scans"); + auto info = make_uniq(TEMP_CATALOG, DEFAULT_SCHEMA, scan_table); info->temporary = true; info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT; // 0. Scan ID @@ -73,7 +73,7 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData } { // Create Rejects Error Table - auto info = make_uniq(TEMP_CATALOG, DEFAULT_SCHEMA, "reject_errors"); + auto info = make_uniq(TEMP_CATALOG, DEFAULT_SCHEMA, errors_table); info->temporary = true; info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT; // 1. 
Row Line diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 2c691aa8bd21..b7f865fc718e 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -51,11 +51,18 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio result->files = MultiFileReader::GetFileList(context, input.inputs[0], "CSV"); options.FromNamedParameters(input.named_parameters, context, return_types, names); - + if (!options.rejects_table_name.empty() && !options.store_rejects.GetValue() && + options.store_rejects.IsSetByUser()) { + throw BinderException( + "rejects_table_name option is only supported when store_rejects is not manually set to false"); + } + // Ensure we set ignore errors to true automagically + options.store_rejects.Set(true, false); // Validate rejects_table options - if (options.store_rejects) { + if (options.store_rejects.GetValue()) { if (!options.ignore_errors.GetValue() && options.ignore_errors.IsSetByUser()) { - throw BinderException("REJECTS_TABLE option is only supported when IGNORE_ERRORS is set to true"); + throw BinderException( + "STORE_REJECTS option is only supported when IGNORE_ERRORS is not manually set to false"); } // Ensure we set ignore errors to true automagically options.ignore_errors.Set(true, false); @@ -63,7 +70,7 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio throw BinderException("REJECTS_TABLE option is not supported when UNION_BY_NAME is set to true"); } } - if (options.rejects_limit != 0 && !options.store_rejects) { + if (options.rejects_limit != 0 && !options.store_rejects.GetValue()) { throw BinderException("REJECTS_LIMIT option is only supported when REJECTS_TABLE is set to a table name"); } @@ -145,8 +152,9 @@ static unique_ptr ReadCSVInitGlobal(ClientContext &con auto &bind_data = input.bind_data->Cast(); // Create the temporary rejects table - if (bind_data.options.store_rejects) { - CSVRejectsTable::GetOrCreate(context)->InitializeTable(context, bind_data); + if (bind_data.options.store_rejects.GetValue()) { + CSVRejectsTable::GetOrCreate(context, bind_data.options.rejects_table_name) + ->InitializeTable(context, bind_data); } if (bind_data.files.empty()) { // This can happen when a filename based filter pushdown has eliminated all possible files for this scan. @@ -227,6 +235,7 @@ void ReadCSVTableFunction::ReadCSVAddNamedParameters(TableFunction &table_functi table_function.named_parameters["maximum_line_size"] = LogicalType::VARCHAR; table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN; table_function.named_parameters["store_rejects"] = LogicalType::BOOLEAN; + table_function.named_parameters["rejects_table"] = LogicalType::VARCHAR; table_function.named_parameters["rejects_limit"] = LogicalType::BIGINT; table_function.named_parameters["buffer_size"] = LogicalType::UBIGINT; table_function.named_parameters["decimal_separator"] = LogicalType::VARCHAR; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp index 436802909c82..a7db5aeb06f4 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp @@ -41,8 +41,10 @@ struct CSVReaderOptions { DialectOptions dialect_options; //! Whether or not we should ignore InvalidInput errors CSVOption ignore_errors = false; - //! Whether we store CSV Errors or not - bool store_rejects = false; + //! 
Whether we store CSV Errors in the rejects table or not + CSVOption store_rejects = false; + //! Rejects table name + string rejects_table_name; //! Rejects table entry limit (0 = no limit) idx_t rejects_limit = 0; //! Number of samples to buffer diff --git a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp index bb4ff62fa4ae..f88eff8028ea 100644 --- a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp +++ b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp @@ -14,14 +14,23 @@ class ClientContext; class CSVRejectsTable : public ObjectCacheEntry { public: - CSVRejectsTable() : count(0) { + CSVRejectsTable(string name) : name(name), count(0) { + if (name.empty()) { + scan_table = "reject_scan"; + errors_table = "reject_errors"; + } else { + scan_table = name + "_scan"; + errors_table = name; + } } ~CSVRejectsTable() override = default; mutex write_lock; - + string name; idx_t count; + string scan_table; + string errors_table; - static shared_ptr GetOrCreate(ClientContext &context); + static shared_ptr GetOrCreate(ClientContext &context, const string &name); void InitializeTable(ClientContext &context, const ReadCSVData &options); TableCatalogEntry &GetTable(ClientContext &context); diff --git a/src/include/duckdb/storage/serialization/nodes.json b/src/include/duckdb/storage/serialization/nodes.json index 6dfb0b003f15..39961131cad2 100644 --- a/src/include/duckdb/storage/serialization/nodes.json +++ b/src/include/duckdb/storage/serialization/nodes.json @@ -605,7 +605,7 @@ }, {"id": 117, "name": "store_rejects", - "type": "bool" + "type": "CSVOption" }, {"id": 118, "name": "rejects_limit", @@ -650,6 +650,10 @@ {"id": 128, "name": "parallel", "type": "bool" + }, + {"id": 129, + "name": "rejects_table_name", + "type": "string" } ], "pointer_type": "none" diff --git a/src/storage/serialization/serialize_nodes.cpp b/src/storage/serialization/serialize_nodes.cpp index c274e2a2b2ac..b7f0d3078810 100644 --- a/src/storage/serialization/serialize_nodes.cpp +++ b/src/storage/serialization/serialize_nodes.cpp @@ -135,7 +135,7 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault(114, "buffer_size", buffer_size); serializer.WriteProperty(115, "file_options", file_options); serializer.WritePropertyWithDefault>(116, "force_quote", force_quote); - serializer.WritePropertyWithDefault(117, "store_rejects", store_rejects); + serializer.WriteProperty>(117, "store_rejects", store_rejects); serializer.WritePropertyWithDefault(118, "rejects_limit", rejects_limit); serializer.WriteProperty>(119, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter); serializer.WriteProperty>(120, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote); @@ -147,6 +147,7 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WriteProperty>>(126, "dialect_options.date_format", dialect_options.date_format); serializer.WritePropertyWithDefault(127, "sniffer_user_mismatch_error", sniffer_user_mismatch_error); serializer.WritePropertyWithDefault(128, "parallel", parallel); + serializer.WritePropertyWithDefault(129, "rejects_table_name", rejects_table_name); } CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { @@ -168,7 +169,7 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { 
deserializer.ReadPropertyWithDefault(114, "buffer_size", result.buffer_size); deserializer.ReadProperty(115, "file_options", result.file_options); deserializer.ReadPropertyWithDefault>(116, "force_quote", result.force_quote); - deserializer.ReadPropertyWithDefault(117, "store_rejects", result.store_rejects); + deserializer.ReadProperty>(117, "store_rejects", result.store_rejects); deserializer.ReadPropertyWithDefault(118, "rejects_limit", result.rejects_limit); deserializer.ReadProperty>(119, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter); deserializer.ReadProperty>(120, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote); @@ -180,6 +181,7 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { deserializer.ReadProperty>>(126, "dialect_options.date_format", result.dialect_options.date_format); deserializer.ReadPropertyWithDefault(127, "sniffer_user_mismatch_error", result.sniffer_user_mismatch_error); deserializer.ReadPropertyWithDefault(128, "parallel", result.parallel); + deserializer.ReadPropertyWithDefault(129, "rejects_table_name", result.rejects_table_name); return result; } From 779ab7f36f9088d54e1f1662ac03daded8d3ed26 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 11 Mar 2024 15:51:26 +0100 Subject: [PATCH 038/147] got rejects table right --- .../table_function/global_csv_state.cpp | 53 +++++++++++-------- .../operator/persistent/csv_rejects_table.cpp | 52 +++++++++++------- .../operator/persistent/csv_rejects_table.hpp | 3 +- 3 files changed, 64 insertions(+), 44 deletions(-) diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 00ba18b82f00..917a581bcbcc 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -177,11 +177,16 @@ void CSVGlobalState::FillRejectsTable() { auto limit = options.rejects_limit; auto rejects = CSVRejectsTable::GetOrCreate(context, options.rejects_table_name); lock_guard lock(rejects->write_lock); - auto &table = rejects->GetTable(context); - InternalAppender appender(context, table); + auto &errors_table = rejects->GetErrorsTable(context); + auto &scans_table = rejects->GetScansTable(context); + InternalAppender errors_appender(context, errors_table); + InternalAppender scans_appender(context, scans_table); + idx_t scan_id = context.transaction.GetActiveQuery(); + idx_t file_id = 0; for (auto &file : file_scans) { auto file_name = file->file_path; auto &errors = file->error_handler->errors; + // We first insert the file into the file scans table for (auto &error_vector : errors) { for (auto &error : error_vector.second) { if (!IsCSVErrorAcceptedReject(error.type)) { @@ -197,36 +202,38 @@ void CSVGlobalState::FillRejectsTable() { auto row_line = file->error_handler->GetLine(error.error_info); auto col_idx = error.column_idx; // Add the row to the rejects table - appender.BeginRow(); - // 1. File Path - appender.Append(string_t(file_name)); - // 2. Row Line - appender.Append(row_line); - // 3. Byte Position where error occurred - appender.Append(error.byte_position); - // 4. Column Index - appender.Append(col_idx + 1); - // 5. Column Name (If Applicable) + errors_appender.BeginRow(); + // 1. Scan Id + errors_appender.Append(scan_id); + // 2. File Id + errors_appender.Append(file_id); + // 3. 
Row Line + errors_appender.Append(row_line); + // 4. Byte Position where error occurred + errors_appender.Append(error.byte_position); + // 5. Column Index + errors_appender.Append(col_idx + 1); + // 6. Column Name (If Applicable) switch (error.type) { case CSVErrorType::TOO_MANY_COLUMNS: - appender.Append(Value()); + errors_appender.Append(Value()); break; case CSVErrorType::TOO_FEW_COLUMNS: D_ASSERT(bind_data.return_names.size() > col_idx + 1); - appender.Append(string_t("\"" + bind_data.return_names[col_idx + 1] + "\"")); + errors_appender.Append(string_t("\"" + bind_data.return_names[col_idx + 1] + "\"")); break; default: - appender.Append(string_t("\"" + bind_data.return_names[col_idx] + "\"")); + errors_appender.Append(string_t("\"" + bind_data.return_names[col_idx] + "\"")); } - // 6. Error Type - appender.Append(string_t(CSVErrorTypeToEnum(error.type))); - // 7. Original CSV Line - appender.Append(string_t(error.csv_row)); - // 8. Full Error Message - appender.Append(string_t(error.error_message)); - appender.EndRow(); + // 7. Error Type + errors_appender.Append(string_t(CSVErrorTypeToEnum(error.type))); + // 8. Original CSV Line + errors_appender.Append(string_t(error.csv_row)); + // 9. Full Error Message + errors_appender.Append(string_t(error.error_message)); + errors_appender.EndRow(); } - appender.Close(); + errors_appender.Close(); } } } diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 7d8094659377..50ddcedc7b54 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -7,12 +7,18 @@ namespace duckdb { -TableCatalogEntry &CSVRejectsTable::GetTable(ClientContext &context) { +TableCatalogEntry &CSVRejectsTable::GetErrorsTable(ClientContext &context) { auto &temp_catalog = Catalog::GetCatalog(context, TEMP_CATALOG); auto &table_entry = temp_catalog.GetEntry(context, TEMP_CATALOG, DEFAULT_SCHEMA, errors_table); return table_entry; } +TableCatalogEntry &CSVRejectsTable::GetScansTable(ClientContext &context) { + auto &temp_catalog = Catalog::GetCatalog(context, TEMP_CATALOG); + auto &table_entry = temp_catalog.GetEntry(context, TEMP_CATALOG, DEFAULT_SCHEMA, scan_table); + return table_entry; +} + shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, const string &name) { auto key = "CSV_REJECTS_TABLE_CACHE_ENTRY_" + StringUtil::Upper(name); auto &cache = ObjectCache::GetObjectCache(context); @@ -45,29 +51,31 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT; // 0. Scan ID info->columns.AddColumn(ColumnDefinition("scan_id", LogicalType::UBIGINT)); - // 1. File Path + // 1. File ID (within the scan) + info->columns.AddColumn(ColumnDefinition("file_id", LogicalType::UBIGINT)); + // 2. File Path info->columns.AddColumn(ColumnDefinition("file_path", LogicalType::VARCHAR)); - // 2. Delimiter + // 3. Delimiter info->columns.AddColumn(ColumnDefinition("delimiter", LogicalType::VARCHAR)); - // 3. Quote + // 4. Quote info->columns.AddColumn(ColumnDefinition("quote", LogicalType::VARCHAR)); - // 4. Escape + // 5. Escape info->columns.AddColumn(ColumnDefinition("escape", LogicalType::VARCHAR)); - // 5. NewLine Delimiter + // 6. NewLine Delimiter info->columns.AddColumn(ColumnDefinition("newline_delimiter", LogicalType::VARCHAR)); - // 6. Skip Rows + // 7. 
Skip Rows
		info->columns.AddColumn(ColumnDefinition("skip_rows", LogicalType::UINTEGER));
-		// 7. Has Header
+		// 8. Has Header
		info->columns.AddColumn(ColumnDefinition("has_header", LogicalType::BOOLEAN));
-		// 8. List<Struct<name, type>>
+		// 9. List<Struct<name, type>>
		info->columns.AddColumn(ColumnDefinition("columns", LogicalType::VARCHAR));
-		// 9. Date Format
+		// 10. Date Format
		info->columns.AddColumn(ColumnDefinition("date_format", LogicalType::VARCHAR));
-		// 10. Timestamp Format
+		// 11. Timestamp Format
		info->columns.AddColumn(ColumnDefinition("timestamp_format", LogicalType::VARCHAR));
-		// 11. CSV read function with all the options used
-		info->columns.AddColumn(ColumnDefinition("user_arguments", LogicalType::VARCHAR));
		// 12. CSV read function with all the options used
+		info->columns.AddColumn(ColumnDefinition("user_arguments", LogicalType::VARCHAR));
+		// 13. CSV read function with all the options used
		info->columns.AddColumn(ColumnDefinition("prompt", LogicalType::VARCHAR));
		catalog.CreateTable(context, std::move(info));
	}
	{
		// Create Rejects Error Table
		auto info = make_uniq<CreateTableInfo>(TEMP_CATALOG, DEFAULT_SCHEMA, errors_table);
		info->temporary = true;
		info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT;
-		// 1. Row Line
+		// 0. Scan ID
+		info->columns.AddColumn(ColumnDefinition("scan_id", LogicalType::UBIGINT));
+		// 1. File ID (within the scan)
+		info->columns.AddColumn(ColumnDefinition("file_id", LogicalType::UBIGINT));
+		// 2. Row Line
		info->columns.AddColumn(ColumnDefinition("line", LogicalType::UBIGINT));
-		// 2. Byte Position where error occurred
+		// 3. Byte Position where error occurred
		info->columns.AddColumn(ColumnDefinition("byte_position", LogicalType::UBIGINT));
-		// 3. Column Index (If Applicable)
+		// 4. Column Index (If Applicable)
		info->columns.AddColumn(ColumnDefinition("column_idx", LogicalType::UBIGINT));
-		// 4. Column Name (If Applicable)
+		// 5. Column Name (If Applicable)
		info->columns.AddColumn(ColumnDefinition("column_name", LogicalType::VARCHAR));
-		// 5. Error Type
+		// 6. Error Type
		info->columns.AddColumn(ColumnDefinition("error_type", enum_type));
-		// 6. Original CSV Line
+		// 7. Original CSV Line
		info->columns.AddColumn(ColumnDefinition("csv_line", LogicalType::VARCHAR));
-		// 7. Full Error Message
+		// 8. 
Full Error Message
		info->columns.AddColumn(ColumnDefinition("error_message", LogicalType::VARCHAR));
		catalog.CreateTable(context, std::move(info));
	}
diff --git a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp
index f88eff8028ea..2a17f0b61851 100644
--- a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp
+++ b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp
@@ -33,7 +33,8 @@ class CSVRejectsTable : public ObjectCacheEntry {
	static shared_ptr<CSVRejectsTable> GetOrCreate(ClientContext &context, const string &name);
	void InitializeTable(ClientContext &context, const ReadCSVData &options);
-	TableCatalogEntry &GetTable(ClientContext &context);
+	TableCatalogEntry &GetErrorsTable(ClientContext &context);
+	TableCatalogEntry &GetScansTable(ClientContext &context);
 public:
	static string ObjectType() {

From 77bfe80f6124e7a950f8b95363e19e7002e84480 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Mon, 11 Mar 2024 16:25:31 +0100
Subject: [PATCH 039/147] First version of rejects_scans and reject_errors
 tables

---
 .../table_function/global_csv_state.cpp       | 76 +++++++++++++++++--
 .../operator/persistent/csv_rejects_table.cpp |  2 -
 src/function/table/sniff_csv.cpp              | 14 +---
 .../csv_scanner/csv_reader_options.hpp        | 11 +++
 .../operator/persistent/csv_rejects_table.hpp |  2 +-
 .../csv/rejects/csv_rejects_double_table.test | 34 +++++++++
 6 files changed, 117 insertions(+), 22 deletions(-)

diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
index 917a581bcbcc..b1fef47e4d1b 100644
--- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
+++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
@@ -170,6 +170,65 @@ string CSVErrorTypeToEnum(CSVErrorType type) {
	}
 }
+void FillScanErrorTable(InternalAppender &scan_appender, idx_t scan_idx, idx_t file_idx, CSVFileScan &file) {
+	CSVReaderOptions &options = file.options;
+	// Add the row to the rejects table
+	scan_appender.BeginRow();
+	// 1. Scan Idx
+	scan_appender.Append(scan_idx);
+	// 2. File Idx
+	scan_appender.Append(file_idx);
+	// 3. File Path
+	scan_appender.Append(string_t(file.file_path));
+	// 4. Delimiter
+	scan_appender.Append(string_t(options.dialect_options.state_machine_options.delimiter.FormatValue()));
+	// 5. Quote
+	scan_appender.Append(string_t(options.dialect_options.state_machine_options.quote.FormatValue()));
+	// 6. Escape
+	scan_appender.Append(string_t(options.dialect_options.state_machine_options.escape.FormatValue()));
+	// 7. NewLine Delimiter
+	scan_appender.Append(string_t(options.NewLineIdentifierToString()));
+	// 8. Skip Rows
+	scan_appender.Append(Value::UINTEGER(NumericCast<uint32_t>(options.dialect_options.skip_rows.GetValue())));
+	// 9. Has Header
+	scan_appender.Append(Value::BOOLEAN(options.dialect_options.header.GetValue()));
+	// 10. List<Struct<name, type>> {'col1': 'INTEGER', 'col2': 'VARCHAR'}
+	std::ostringstream columns;
+	columns << "{";
+	for (idx_t i = 0; i < file.types.size(); i++) {
+		columns << "'" << file.names[i] << "': '" << file.types[i].ToString() << "'";
+		if (i != file.types.size() - 1) {
+			columns << ",";
+		}
+	}
+	columns << "}";
+	scan_appender.Append(string_t(columns.str()));
+	// 11. 
Date Format + auto date_format = options.dialect_options.date_format[LogicalType::DATE].GetValue(); + if (!date_format.Empty()) { + scan_appender.Append(string_t(date_format.format_specifier)); + } else { + scan_appender.Append(Value()); + } + + // 12. Timestamp Format + auto timestamp_format = options.dialect_options.date_format[LogicalType::TIMESTAMP].GetValue(); + if (!timestamp_format.Empty()) { + scan_appender.Append(string_t(timestamp_format.format_specifier)); + } else { + scan_appender.Append(Value()); + } + + // 13. The Extra User Arguments + if (options.user_defined_parameters.empty()) { + scan_appender.Append(Value()); + } else { + scan_appender.Append(string_t(options.user_defined_parameters)); + } + // Finish the row to the rejects table + scan_appender.EndRow(); +} + void CSVGlobalState::FillRejectsTable() { auto &options = bind_data.options; @@ -181,8 +240,8 @@ void CSVGlobalState::FillRejectsTable() { auto &scans_table = rejects->GetScansTable(context); InternalAppender errors_appender(context, errors_table); InternalAppender scans_appender(context, scans_table); - idx_t scan_id = context.transaction.GetActiveQuery(); - idx_t file_id = 0; + idx_t scan_idx = context.transaction.GetActiveQuery(); + idx_t file_idx = 0; for (auto &file : file_scans) { auto file_name = file->file_path; auto &errors = file->error_handler->errors; @@ -190,7 +249,6 @@ void CSVGlobalState::FillRejectsTable() { for (auto &error_vector : errors) { for (auto &error : error_vector.second) { if (!IsCSVErrorAcceptedReject(error.type)) { - // For now, we only will use it for casting errors continue; } // short circuit if we already have too many rejects @@ -204,9 +262,9 @@ void CSVGlobalState::FillRejectsTable() { // Add the row to the rejects table errors_appender.BeginRow(); // 1. Scan Id - errors_appender.Append(scan_id); + errors_appender.Append(scan_idx); // 2. File Id - errors_appender.Append(file_id); + errors_appender.Append(file_idx); // 3. Row Line errors_appender.Append(row_line); // 4. Byte Position where error occurred @@ -233,10 +291,16 @@ void CSVGlobalState::FillRejectsTable() { errors_appender.Append(string_t(error.error_message)); errors_appender.EndRow(); } - errors_appender.Close(); } } + if (rejects->count != 0) { + rejects->count = 0; + FillScanErrorTable(scans_appender, scan_idx, file_idx, *file); + } + file_idx++; } + errors_appender.Close(); + scans_appender.Close(); } } diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 50ddcedc7b54..e74a7806a4ef 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -75,8 +75,6 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData info->columns.AddColumn(ColumnDefinition("timestamp_format", LogicalType::VARCHAR)); // 12. CSV read function with all the options used info->columns.AddColumn(ColumnDefinition("user_arguments", LogicalType::VARCHAR)); - // 13. 
CSV read function with all the options used - info->columns.AddColumn(ColumnDefinition("prompt", LogicalType::VARCHAR)); catalog.CreateTable(context, std::move(info)); } { diff --git a/src/function/table/sniff_csv.cpp b/src/function/table/sniff_csv.cpp index f135b15c615d..28f248d4f459 100644 --- a/src/function/table/sniff_csv.cpp +++ b/src/function/table/sniff_csv.cpp @@ -83,17 +83,6 @@ static unique_ptr CSVSniffBind(ClientContext &context, TableFuncti return std::move(result); } -string NewLineIdentifierToString(NewLineIdentifier identifier) { - switch (identifier) { - case NewLineIdentifier::SINGLE: - return "\\n"; - case NewLineIdentifier::CARRY_ON: - return "\\r\\n"; - default: - return ""; - } -} - string FormatOptions(char opt) { if (opt == '\'') { return "''"; @@ -138,8 +127,7 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p, str_opt = sniffer_options.dialect_options.state_machine_options.escape.GetValue(); output.SetValue(2, 0, str_opt); // 4. NewLine Delimiter - auto new_line_identifier = - NewLineIdentifierToString(sniffer_options.dialect_options.state_machine_options.new_line.GetValue()); + auto new_line_identifier = sniffer_options.NewLineIdentifierToString(); output.SetValue(3, 0, new_line_identifier); // 5. Skip Rows output.SetValue(4, 0, Value::UINTEGER(NumericCast(sniffer_options.dialect_options.skip_rows.GetValue()))); diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp index a7db5aeb06f4..faabfe62f23e 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp @@ -157,5 +157,16 @@ struct CSVReaderOptions { vector &names); string ToString() const; + + string NewLineIdentifierToString() { + switch (dialect_options.state_machine_options.new_line.GetValue()) { + case NewLineIdentifier::SINGLE: + return "\\n"; + case NewLineIdentifier::CARRY_ON: + return "\\r\\n"; + default: + return ""; + } + } }; } // namespace duckdb diff --git a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp index 2a17f0b61851..6254d7cecc01 100644 --- a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp +++ b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp @@ -16,7 +16,7 @@ class CSVRejectsTable : public ObjectCacheEntry { public: CSVRejectsTable(string name) : name(name), count(0) { if (name.empty()) { - scan_table = "reject_scan"; + scan_table = "reject_scans"; errors_table = "reject_errors"; } else { scan_table = name + "_scan"; diff --git a/test/sql/copy/csv/rejects/csv_rejects_double_table.test b/test/sql/copy/csv/rejects/csv_rejects_double_table.test index e69de29bb2d1..1d82bc77e2b8 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_double_table.test +++ b/test/sql/copy/csv/rejects/csv_rejects_double_table.test @@ -0,0 +1,34 @@ +# name: test/sql/copy/csv/rejects/csv_rejects_double_table.test +# group: [rejects] + +require skip_reload + +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + +# Ensure that we can get the schema if we reduce the sample size and ignore errors +query IIIII +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', 
+ sample_size=1, + store_rejects=true, + ignore_errors=true); +---- +BIGINT VARCHAR 11044 11044 2 + + +query IIIIIIIIIIIII +SELECT * +FROM reject_scans order by all; +---- +3 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL ignore_errors=true, store_rejects=true, sample_size=1 +3 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL ignore_errors=true, store_rejects=true, sample_size=1 + +query IIIIIIIII +SELECT * +FROM reject_errors order by all; +---- +3 0 2176 10875 1 "column0" CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 0 4176 20875 1 "column0" CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +3 1 3680 18395 1 "column0" CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 1 5680 28395 1 "column0" CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' \ No newline at end of file From b6479d951ff573130543153374669038716e3c1c Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 11 Mar 2024 17:20:51 +0100 Subject: [PATCH 040/147] More adjustments --- .../table_function/global_csv_state.cpp | 4 ++-- .../csv/rejects/csv_rejects_double_table.test | 15 +++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index b1fef47e4d1b..e59d12963348 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -278,10 +278,10 @@ void CSVGlobalState::FillRejectsTable() { break; case CSVErrorType::TOO_FEW_COLUMNS: D_ASSERT(bind_data.return_names.size() > col_idx + 1); - errors_appender.Append(string_t("\"" + bind_data.return_names[col_idx + 1] + "\"")); + errors_appender.Append(string_t(bind_data.return_names[col_idx + 1])); break; default: - errors_appender.Append(string_t("\"" + bind_data.return_names[col_idx] + "\"")); + errors_appender.Append(string_t(bind_data.return_names[col_idx])); } // 7. 
Error Type errors_appender.Append(string_t(CSVErrorTypeToEnum(error.type))); diff --git a/test/sql/copy/csv/rejects/csv_rejects_double_table.test b/test/sql/copy/csv/rejects/csv_rejects_double_table.test index 1d82bc77e2b8..d2714bd3be55 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_double_table.test +++ b/test/sql/copy/csv/rejects/csv_rejects_double_table.test @@ -11,8 +11,7 @@ query IIIII SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', sample_size=1, - store_rejects=true, - ignore_errors=true); + store_rejects=true); ---- BIGINT VARCHAR 11044 11044 2 @@ -21,14 +20,14 @@ query IIIIIIIIIIIII SELECT * FROM reject_scans order by all; ---- -3 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL ignore_errors=true, store_rejects=true, sample_size=1 -3 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL ignore_errors=true, store_rejects=true, sample_size=1 +3 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1 +3 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1 query IIIIIIIII SELECT * FROM reject_errors order by all; ---- -3 0 2176 10875 1 "column0" CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -3 0 4176 20875 1 "column0" CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' -3 1 3680 18395 1 "column0" CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -3 1 5680 28395 1 "column0" CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' \ No newline at end of file +3 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +3 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT' \ No newline at end of file From 34db0eee2f02b69e9d518bbeb9f7482593898c54 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 12 Mar 2024 15:03:42 +0100 Subject: [PATCH 041/147] Alright lets have different options for different tables --- .../table_function/global_csv_state.cpp | 3 ++- .../csv_scanner/util/csv_reader_options.cpp | 9 ++++++++- .../operator/persistent/csv_rejects_table.cpp | 8 +++++--- src/function/table/read_csv.cpp | 17 +++++++++++++---- .../operator/csv_scanner/csv_reader_options.hpp | 6 ++++-- .../operator/persistent/csv_rejects_table.hpp | 13 ++++--------- .../duckdb/storage/serialization/nodes.json | 6 +++++- src/storage/serialization/serialize_nodes.cpp | 6 ++++-- 8 files changed, 45 insertions(+), 23 deletions(-) diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index e59d12963348..ae2fddf9df0b 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -234,7 +234,8 @@ void CSVGlobalState::FillRejectsTable() { if (options.store_rejects.GetValue()) { auto limit = options.rejects_limit; - auto rejects = CSVRejectsTable::GetOrCreate(context, options.rejects_table_name); + auto rejects = CSVRejectsTable::GetOrCreate(context, options.rejects_scan_name.GetValue(), + options.rejects_table_name.GetValue()); lock_guard lock(rejects->write_lock); auto &errors_table = rejects->GetErrorsTable(context); auto &scans_table = rejects->GetScansTable(context); diff --git a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp index 7fcb7f3383b1..b06f58779328 100644 --- a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +++ b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp @@ -214,7 +214,14 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, if (table_name.empty()) { throw BinderException("REJECTS_TABLE option cannot be empty"); } - rejects_table_name = table_name; + rejects_table_name.Set(table_name); + } else if (loption == "rejects_scan") { + // skip, handled in SetRejectsOptions + auto table_name = ParseString(value, loption); + if (table_name.empty()) { + throw BinderException("rejects_scan option cannot be empty"); + } + rejects_scan_name.Set(table_name); } else if (loption == "rejects_limit") { int64_t limit = ParseInteger(value, loption); if (limit < 0) { diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index e74a7806a4ef..f9672135e4e9 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -19,10 +19,12 @@ TableCatalogEntry &CSVRejectsTable::GetScansTable(ClientContext &context) { return table_entry; } -shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, const string &name) { - auto key = "CSV_REJECTS_TABLE_CACHE_ENTRY_" + StringUtil::Upper(name); +shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, const string &rejects_scan, + const string &rejects_error) { + auto key = + "CSV_REJECTS_TABLE_CACHE_ENTRY_" + StringUtil::Upper(rejects_scan) + "_" + StringUtil::Upper(rejects_error); auto &cache = ObjectCache::GetObjectCache(context); - return cache.GetOrCreate(key, name); + return cache.GetOrCreate(key, rejects_scan, 
rejects_error);
 }
 
 void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData &data) {
diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp
index b7f865fc718e..258d69d3871a 100644
--- a/src/function/table/read_csv.cpp
+++ b/src/function/table/read_csv.cpp
@@ -51,13 +51,20 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio
 	result->files = MultiFileReader::GetFileList(context, input.inputs[0], "CSV");
 
 	options.FromNamedParameters(input.named_parameters, context, return_types, names);
-	if (!options.rejects_table_name.empty() && !options.store_rejects.GetValue() &&
+	if (options.rejects_table_name.IsSetByUser() && !options.store_rejects.GetValue() &&
 	    options.store_rejects.IsSetByUser()) {
 		throw BinderException(
 		    "rejects_table_name option is only supported when store_rejects is not manually set to false");
 	}
-	// Ensure we set ignore errors to true automagically
-	options.store_rejects.Set(true, false);
+	if (options.rejects_scan_name.IsSetByUser() && !options.store_rejects.GetValue() &&
+	    options.store_rejects.IsSetByUser()) {
+		throw BinderException(
+		    "rejects_scan_name option is only supported when store_rejects is not manually set to false");
+	}
+	if (options.rejects_scan_name.IsSetByUser() || options.rejects_table_name.IsSetByUser()) {
+		// Ensure we set store_rejects to true automagically
+		options.store_rejects.Set(true, false);
+	}
 	// Validate rejects_table options
 	if (options.store_rejects.GetValue()) {
 		if (!options.ignore_errors.GetValue() && options.ignore_errors.IsSetByUser()) {
@@ -153,7 +160,8 @@ static unique_ptr ReadCSVInitGlobal(ClientContext &con
 	// Create the temporary rejects table
 	if (bind_data.options.store_rejects.GetValue()) {
-		CSVRejectsTable::GetOrCreate(context, bind_data.options.rejects_table_name)
+		CSVRejectsTable::GetOrCreate(context, bind_data.options.rejects_scan_name.GetValue(),
+		                             bind_data.options.rejects_table_name.GetValue())
 		    ->InitializeTable(context, bind_data);
 	}
 	if (bind_data.files.empty()) {
@@ -236,6 +244,7 @@ void ReadCSVTableFunction::ReadCSVAddNamedParameters(TableFunction &table_functi
 	table_function.named_parameters["ignore_errors"] = LogicalType::BOOLEAN;
 	table_function.named_parameters["store_rejects"] = LogicalType::BOOLEAN;
 	table_function.named_parameters["rejects_table"] = LogicalType::VARCHAR;
+	table_function.named_parameters["rejects_scan"] = LogicalType::VARCHAR;
 	table_function.named_parameters["rejects_limit"] = LogicalType::BIGINT;
 	table_function.named_parameters["buffer_size"] = LogicalType::UBIGINT;
 	table_function.named_parameters["decimal_separator"] = LogicalType::VARCHAR;
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
index faabfe62f23e..4b69d9aad222 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
@@ -43,8 +43,10 @@ struct CSVReaderOptions {
 	CSVOption ignore_errors = false;
 	//! Whether we store CSV Errors in the rejects table or not
 	CSVOption store_rejects = false;
-	//! Rejects table name
-	string rejects_table_name;
+	//! Rejects table name (name of the table that stores the reject errors)
+	CSVOption rejects_table_name = {"reject_errors"};
+	//! Rejects scan name (name of the table that stores the reject scans)
+	CSVOption rejects_scan_name = {"reject_scans"};
 	//! Rejects table entry limit (0 = no limit)
 	idx_t rejects_limit = 0;
 	//!
Number of samples to buffer diff --git a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp index 6254d7cecc01..ee1d2092660b 100644 --- a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp +++ b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp @@ -14,14 +14,8 @@ class ClientContext; class CSVRejectsTable : public ObjectCacheEntry { public: - CSVRejectsTable(string name) : name(name), count(0) { - if (name.empty()) { - scan_table = "reject_scans"; - errors_table = "reject_errors"; - } else { - scan_table = name + "_scan"; - errors_table = name; - } + CSVRejectsTable(string rejects_scan, string rejects_error) + : count(0), scan_table(rejects_scan), errors_table(rejects_error) { } ~CSVRejectsTable() override = default; mutex write_lock; @@ -30,7 +24,8 @@ class CSVRejectsTable : public ObjectCacheEntry { string scan_table; string errors_table; - static shared_ptr GetOrCreate(ClientContext &context, const string &name); + static shared_ptr GetOrCreate(ClientContext &context, const string &rejects_scan, + const string &rejects_error); void InitializeTable(ClientContext &context, const ReadCSVData &options); TableCatalogEntry &GetErrorsTable(ClientContext &context); diff --git a/src/include/duckdb/storage/serialization/nodes.json b/src/include/duckdb/storage/serialization/nodes.json index 39961131cad2..e3bec298114c 100644 --- a/src/include/duckdb/storage/serialization/nodes.json +++ b/src/include/duckdb/storage/serialization/nodes.json @@ -653,7 +653,11 @@ }, {"id": 129, "name": "rejects_table_name", - "type": "string" + "type": "CSVOption" + }, + {"id": 130, + "name": "rejects_scan_name", + "type": "CSVOption" } ], "pointer_type": "none" diff --git a/src/storage/serialization/serialize_nodes.cpp b/src/storage/serialization/serialize_nodes.cpp index b7f0d3078810..bc9ee449a8d5 100644 --- a/src/storage/serialization/serialize_nodes.cpp +++ b/src/storage/serialization/serialize_nodes.cpp @@ -147,7 +147,8 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WriteProperty>>(126, "dialect_options.date_format", dialect_options.date_format); serializer.WritePropertyWithDefault(127, "sniffer_user_mismatch_error", sniffer_user_mismatch_error); serializer.WritePropertyWithDefault(128, "parallel", parallel); - serializer.WritePropertyWithDefault(129, "rejects_table_name", rejects_table_name); + serializer.WriteProperty>(129, "rejects_table_name", rejects_table_name); + serializer.WriteProperty>(130, "rejects_scan_name", rejects_scan_name); } CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { @@ -181,7 +182,8 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { deserializer.ReadProperty>>(126, "dialect_options.date_format", result.dialect_options.date_format); deserializer.ReadPropertyWithDefault(127, "sniffer_user_mismatch_error", result.sniffer_user_mismatch_error); deserializer.ReadPropertyWithDefault(128, "parallel", result.parallel); - deserializer.ReadPropertyWithDefault(129, "rejects_table_name", result.rejects_table_name); + deserializer.ReadProperty>(129, "rejects_table_name", result.rejects_table_name); + deserializer.ReadProperty>(130, "rejects_scan_name", result.rejects_scan_name); return result; } From d877701dd895d2809c029cb1c32232ff43f12017 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 12 Mar 2024 17:43:15 +0100 Subject: [PATCH 042/147] Lots more tests --- 
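For context on the hunks above: the CSVOption wrapper is what lets the binder tell the built-in defaults ("reject_errors", "reject_scans") apart from names the user typed. Below is a minimal sketch of the semantics these patches rely on. It is illustrative only; CSVOptionSketch is a made-up stand-in name, and the real template lives in DuckDB's CSV scanner headers.

#include <utility>

// Minimal model of the CSVOption wrapper used above: a value plus a flag
// recording whether the user supplied it explicitly. Set(v) marks the option
// as user-provided; Set(v, false) changes the value while keeping it
// "default", which is how store_rejects gets switched on automatically.
template <typename T>
struct CSVOptionSketch {
	T value;
	bool set_by_user = false;

	CSVOptionSketch(T default_value) : value(std::move(default_value)) {
	}
	void Set(T new_value, bool by_user = true) {
		value = std::move(new_value);
		set_by_user = by_user;
	}
	const T &GetValue() const {
		return value;
	}
	bool IsSetByUser() const {
		return set_by_user;
	}
};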
src/catalog/catalog.cpp | 11 + .../csv_scanner/util/csv_reader_options.cpp | 2 +- .../operator/persistent/csv_rejects_table.cpp | 18 +- src/function/table/read_csv.cpp | 6 +- src/include/duckdb/catalog/catalog.hpp | 3 + .../csv/rejects/csv_rejects_double_table.test | 33 --- .../csv/rejects/csv_rejects_two_tables.test | 241 ++++++++++++++++++ 7 files changed, 274 insertions(+), 40 deletions(-) delete mode 100644 test/sql/copy/csv/rejects/csv_rejects_double_table.test create mode 100644 test/sql/copy/csv/rejects/csv_rejects_two_tables.test diff --git a/src/catalog/catalog.cpp b/src/catalog/catalog.cpp index 7294427a2942..1674128837f3 100644 --- a/src/catalog/catalog.cpp +++ b/src/catalog/catalog.cpp @@ -758,6 +758,17 @@ CatalogEntry &Catalog::GetEntry(ClientContext &context, const string &schema, co throw CatalogException("CatalogElement \"%s.%s\" does not exist!", schema, name); } +bool Catalog::EntryExists(ClientContext &context, const string &schema, const string &name) { + vector entry_types {CatalogType::TABLE_ENTRY, CatalogType::SEQUENCE_ENTRY}; + for (auto entry_type : entry_types) { + auto result = GetEntry(context, entry_type, schema, name, OnEntryNotFound::RETURN_NULL); + if (result) { + return true; + } + } + return false; +} + optional_ptr Catalog::GetEntry(ClientContext &context, CatalogType type, const string &schema_name, const string &name, OnEntryNotFound if_not_found, QueryErrorContext error_context) { diff --git a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp index b06f58779328..9ea7bb80992d 100644 --- a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp +++ b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp @@ -207,7 +207,7 @@ void CSVReaderOptions::SetReadOption(const string &loption, const Value &value, } else if (loption == "allow_quoted_nulls") { allow_quoted_nulls = ParseBoolean(value, loption); } else if (loption == "store_rejects") { - store_rejects = ParseBoolean(value, loption); + store_rejects.Set(ParseBoolean(value, loption)); } else if (loption == "rejects_table") { // skip, handled in SetRejectsOptions auto table_name = ParseString(value, loption); diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index f9672135e4e9..429d385553e2 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -24,6 +24,20 @@ shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, auto key = "CSV_REJECTS_TABLE_CACHE_ENTRY_" + StringUtil::Upper(rejects_scan) + "_" + StringUtil::Upper(rejects_error); auto &cache = ObjectCache::GetObjectCache(context); + auto &catalog = Catalog::GetCatalog(context, TEMP_CATALOG); + bool rejects_scan_exist = catalog.EntryExists(context, DEFAULT_SCHEMA, rejects_scan); + bool rejects_error_exist = catalog.EntryExists(context, DEFAULT_SCHEMA, rejects_error); + if ((rejects_scan_exist || rejects_error_exist) && !cache.Get(key)) { + std::ostringstream error; + if (rejects_scan_exist) { + error << "Reject Scan Table name \"" << rejects_scan << "\" is already in use. "; + } + if (rejects_error_exist) { + error << "Reject Error Table name \"" << rejects_error << "\" is already in use. 
"; + } + error << "Either drop the used name(s), or give other name options in the CSV Reader function.\n"; + throw BinderException(error.str()); + } return cache.GetOrCreate(key, rejects_scan, rejects_error); } @@ -50,7 +64,7 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData { auto info = make_uniq(TEMP_CATALOG, DEFAULT_SCHEMA, scan_table); info->temporary = true; - info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT; + info->on_conflict = OnCreateConflict::IGNORE_ON_CONFLICT; // 0. Scan ID info->columns.AddColumn(ColumnDefinition("scan_id", LogicalType::UBIGINT)); // 1. File ID (within the scan) @@ -83,7 +97,7 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData // Create Rejects Error Table auto info = make_uniq(TEMP_CATALOG, DEFAULT_SCHEMA, errors_table); info->temporary = true; - info->on_conflict = OnCreateConflict::ERROR_ON_CONFLICT; + info->on_conflict = OnCreateConflict::IGNORE_ON_CONFLICT; // 0. Scan ID info->columns.AddColumn(ColumnDefinition("scan_id", LogicalType::UBIGINT)); // 1. File ID (within the scan) diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 258d69d3871a..60d942b3c8a7 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -53,13 +53,11 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio options.FromNamedParameters(input.named_parameters, context, return_types, names); if (options.rejects_table_name.IsSetByUser() && !options.store_rejects.GetValue() && options.store_rejects.IsSetByUser()) { - throw BinderException( - "rejects_table_name option is only supported when store_rejects is not manually set to false"); + throw BinderException("REJECTS_TABLE option is only supported when store_rejects is not manually set to false"); } if (options.rejects_scan_name.IsSetByUser() && !options.store_rejects.GetValue() && options.store_rejects.IsSetByUser()) { - throw BinderException( - "rejects_scan_name option is only supported when store_rejects is not manually set to false"); + throw BinderException("REJECTS_SCAN option is only supported when store_rejects is not manually set to false"); } if (options.rejects_scan_name.IsSetByUser() || options.rejects_table_name.IsSetByUser()) { // Ensure we set store_rejects to true automagically diff --git a/src/include/duckdb/catalog/catalog.hpp b/src/include/duckdb/catalog/catalog.hpp index 0bbf322c628b..ead5f183c75b 100644 --- a/src/include/duckdb/catalog/catalog.hpp +++ b/src/include/duckdb/catalog/catalog.hpp @@ -228,6 +228,9 @@ class Catalog { //! Gets the "schema.name" entry without a specified type, if entry does not exist an exception is thrown DUCKDB_API CatalogEntry &GetEntry(ClientContext &context, const string &schema, const string &name); + //! Returns true if the "schema.name" entry without a specified type exists + DUCKDB_API bool EntryExists(ClientContext &context, const string &schema, const string &name); + //! 
Fetches a logical type from the catalog DUCKDB_API LogicalType GetType(ClientContext &context, const string &schema, const string &names, OnEntryNotFound if_not_found); diff --git a/test/sql/copy/csv/rejects/csv_rejects_double_table.test b/test/sql/copy/csv/rejects/csv_rejects_double_table.test deleted file mode 100644 index d2714bd3be55..000000000000 --- a/test/sql/copy/csv/rejects/csv_rejects_double_table.test +++ /dev/null @@ -1,33 +0,0 @@ -# name: test/sql/copy/csv/rejects/csv_rejects_double_table.test -# group: [rejects] - -require skip_reload - -# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n -require notwindows - -# Ensure that we can get the schema if we reduce the sample size and ignore errors -query IIIII -SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( - 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', - sample_size=1, - store_rejects=true); ----- -BIGINT VARCHAR 11044 11044 2 - - -query IIIIIIIIIIIII -SELECT * -FROM reject_scans order by all; ----- -3 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1 -3 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1 - -query IIIIIIIII -SELECT * -FROM reject_errors order by all; ----- -3 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -3 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' -3 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -3 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' \ No newline at end of file diff --git a/test/sql/copy/csv/rejects/csv_rejects_two_tables.test b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test new file mode 100644 index 000000000000..e9ad454f6052 --- /dev/null +++ b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test @@ -0,0 +1,241 @@ +# name: test/sql/copy/csv/rejects/csv_rejects_two_tables.test +# group: [rejects] + +require skip_reload + +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + +# Ensure that we can get the schema if we reduce the sample size and ignore errors +query IIIII +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + store_rejects=true); +---- +BIGINT VARCHAR 11044 11044 2 + + +query IIIIIIIIIIIII +SELECT * +FROM reject_scans order by all; +---- +3 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1 +3 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1 + +query IIIIIIIII +SELECT * +FROM reject_errors order by all; +---- +3 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT' +3 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' + +# Test giving the name of errors table +statement error +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + rejects_table = 'rejects_errors_2'); +---- +Reject Scan Table name "reject_scans" is already in use. Either drop the used name(s), or give other name options in the CSV Reader function. + +statement ok +drop table reject_scans; + +query IIIII +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + rejects_table = 'rejects_errors_2' + ); +---- +BIGINT VARCHAR 11044 11044 2 + +query IIIIIIIIIIIII +SELECT * +FROM reject_scans order by all; +---- +8 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_2', sample_size=1 +8 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_2', sample_size=1 + +query IIIIIIIII +SELECT * +FROM rejects_errors_2 order by all; +---- +8 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +8 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +8 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +8 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' + +statement ok +drop table reject_errors; + +# Test giving the name of scans table +query IIIII +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + rejects_scan = 'rejects_scan_2'); +---- +BIGINT VARCHAR 11044 11044 2 + +query IIIIIIIIIIIII +SELECT * +FROM rejects_scan_2 order by all; +---- +12 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_2', sample_size=1 +12 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_2', sample_size=1 + +query IIIIIIIII +SELECT * +FROM reject_errors order by all; +---- +12 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +12 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +12 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +12 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT' + + +# Test giving the name of both tables +query IIIII +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + rejects_scan = 'rejects_scan_3', + rejects_table = 'rejects_errors_3' + ); +---- +BIGINT VARCHAR 11044 11044 2 + +query IIIIIIIIIIIII +SELECT * +FROM rejects_scan_3 order by all; +---- +15 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_3', rejects_scan='rejects_scan_3', sample_size=1 +15 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_3', rejects_scan='rejects_scan_3', sample_size=1 + +query IIIIIIIII +SELECT * +FROM rejects_errors_3 order by all; +---- +15 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +15 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +15 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +15 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' + +statement ok +drop table reject_errors; + +statement ok +drop table reject_scans; + + +# Test giving the name of an existing table to the errors table +statement ok +create temporary table t (a integer); + +statement error +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + rejects_table = 't' + ); +---- +Reject Error Table name "t" is already in use. Either drop the used name(s), or give other name options in the CSV Reader function. + +# Test giving the name of an existing table to the scans table + +statement error +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + rejects_scan = 't' + ); +---- +Reject Scan Table name "t" is already in use. Either drop the used name(s), or give other name options in the CSV Reader function. + +statement error +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + rejects_table = 't', + rejects_scan = 't' + ); +---- +Reject Scan Table name "t" is already in use. Reject Error Table name "t" is already in use. Either drop the used name(s), or give other name options in the CSV Reader function. 
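# (Illustrative aside, not part of this test.) Once both tables are
# populated, errors pair up with their scan metadata on (scan_id, file_id),
# e.g.:
#   SELECT s.file_path, e.line, e.error_type, e.error_message
#   FROM reject_errors e
#   JOIN reject_scans s USING (scan_id, file_id)
#   ORDER BY ALL;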
+
+
+# Test giving the name of the tables with store_rejects and/or ignore_errors set to false throws
+statement error
+SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
+    'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
+    sample_size=1,
+    rejects_scan = 'rejects_scan_3',
+    rejects_table = 'rejects_errors_3',
+    ignore_errors = false
+    );
+----
+STORE_REJECTS option is only supported when IGNORE_ERRORS is not manually set to false
+
+statement error
+SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
+    'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
+    sample_size=1,
+    store_rejects = true,
+    ignore_errors = false
+    );
+----
+STORE_REJECTS option is only supported when IGNORE_ERRORS is not manually set to false
+
+statement error
+SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
+    'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
+    sample_size=1,
+    rejects_table = 'rejects_errors_3',
+    ignore_errors = false
+    );
+----
+STORE_REJECTS option is only supported when IGNORE_ERRORS is not manually set to false
+
+statement error
+SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
+    'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
+    sample_size=1,
+    rejects_scan = 'rejects_scan_3',
+    ignore_errors = false
+    );
+----
+STORE_REJECTS option is only supported when IGNORE_ERRORS is not manually set to false
+
+statement error
+SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
+    'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
+    sample_size=1,
+    rejects_scan = 'rejects_scan_3',
+    rejects_table = 'rejects_errors_3',
+    store_rejects = false
+    );
+----
+REJECTS_TABLE option is only supported when store_rejects is not manually set to false
+
+statement error
+SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
+    'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
+    sample_size=1,
+    rejects_table = 'rejects_errors_3',
+    store_rejects = false
+    );
+----
+REJECTS_TABLE option is only supported when store_rejects is not manually set to false
+
+statement error
+SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto(
+    'test/sql/copy/csv/data/error/mismatch/big_bad*.csv',
+    sample_size=1,
+    rejects_scan = 'rejects_scan_3',
+    store_rejects = false
+    );
+----
+REJECTS_SCAN option is only supported when store_rejects is not manually set to false
\ No newline at end of file

From baca88167e12a3233aaa7aaee83a2b7ff1339c9e Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 13 Mar 2024 14:34:14 +0100
Subject: [PATCH 043/147] Lots of adjustments to make it possible to have
 cur_pos be the exact place where an error happened, and to produce multiple
 errors in the same row

---
 .../scanner/string_value_scanner.cpp          | 121 +++++++++---------
 .../csv_scanner/string_value_scanner.hpp      |  28 ++--
 2 files changed, 70 insertions(+), 79 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index ae9d470a32ce..277924d8c01a 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++
b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -111,10 +111,6 @@ inline bool IsValueNull(const char *null_str_ptr, const char *value_ptr, const i
 }
 
 void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size, bool allocate) {
-	if (current_error.is_set) {
-		cur_col_id++;
-		return;
-	}
 	if (cur_col_id >= number_of_columns) {
 		bool error = true;
 		if (cur_col_id == number_of_columns && ((quoted && state_machine.options.allow_quoted_nulls) || !quoted)) {
@@ -122,7 +118,9 @@
 			error = !IsValueNull(null_str_ptr, value_ptr, size);
 		}
 		if (error) {
-			current_error = {CSVErrorType::TOO_MANY_COLUMNS, cur_col_id};
+			// We error pointing to the current value error.
+			current_errors.push_back(
+			    {CSVErrorType::TOO_MANY_COLUMNS, cur_col_id, {iterator.pos.buffer_idx, last_position, buffer_size}});
 		}
 		return;
 	}
@@ -143,7 +141,9 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 		if (empty) {
 			if (parse_types[chunk_col_id].first != LogicalTypeId::VARCHAR) {
 				// If it is not a varchar, empty values are not accepted, we must error.
-				cast_errors[chunk_col_id] = std::string("");
+				current_errors.push_back({CSVErrorType::CAST_ERROR,
+				                          cur_col_id,
+				                          {iterator.pos.buffer_idx, last_position, buffer_size}});
 			}
 			static_cast(vector_ptr[chunk_col_id])[number_of_rows] = string_t();
 		} else {
@@ -225,7 +225,8 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 				HandleUnicodeError(cur_col_id, force_error);
 			}
 			// If we got here, we are ignoring errors, hence we must ignore this line.
-			current_error = {CSVErrorType::INVALID_UNICODE, cur_col_id};
+			current_errors.push_back(
+			    {CSVErrorType::INVALID_UNICODE, cur_col_id, {iterator.pos.buffer_idx, last_position, buffer_size}});
 			break;
 		}
 		if (allocate) {
@@ -241,7 +242,13 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 	}
 	if (!success) {
 		// We had a casting error, we push it here because we can only error when finishing the line read.
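// (Illustrative sketch, not part of the patch; PendingCSVError is a made-up
// stand-in for the CurrentError type introduced below.) The control flow this
// commit moves to: instead of erroring mid-value, each problem is queued
// together with the exact buffer position and replayed once the full line has
// been read, which is what lets one CSV row report several errors.
#include <cstddef>
#include <string>
#include <vector>

struct PendingCSVError {
	int type;             // stands in for CSVErrorType
	std::size_t col_idx;  // column where the problem was found
	std::size_t position; // byte offset of the offending value
	std::string message;  // human-readable description, filled for cast errors
};

// While parsing:  pending.push_back({kCastError, col, pos, "Could not convert ..."});
// At end of row:  report every queued entry, clear the list, and drop the row.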
- cast_errors[cur_col_id] = std::string(value_ptr, size); + std::ostringstream error; + // Casting Error Message + error << "Could not convert string \"" << std::string(value_ptr, size) << "\" to \'" + << LogicalTypeIdToString(parse_types[cur_col_id].first) << "\'"; + current_errors.push_back( + {CSVErrorType::INVALID_UNICODE, cur_col_id, {iterator.pos.buffer_idx, last_position, buffer_size}}); + current_errors.back().error_message = error.str(); } cur_col_id++; chunk_col_id++; @@ -282,7 +289,7 @@ void StringValueResult::Reset() { if (cur_buffer) { buffer_handles[cur_buffer->buffer_idx] = cur_buffer; } - current_error.Reset(); + current_errors.clear(); } void StringValueResult::AddQuotedValue(StringValueResult &result, const idx_t buffer_pos) { @@ -327,16 +334,6 @@ void StringValueResult::AddValue(StringValueResult &result, const idx_t buffer_p result.last_position = buffer_pos + 1; } -void StringValueResult::HandleOverLimitRows(idx_t col_idx) { - LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); - bool first_nl; - auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); - auto csv_error = - CSVError::IncorrectColumnAmountError(state_machine.options, col_idx, lines_per_batch, borked_line, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); - error_handler.Error(csv_error); -} - void StringValueResult::HandleUnicodeError(idx_t col_idx, bool force_error) { bool first_nl; auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); @@ -351,35 +348,50 @@ void StringValueResult::HandleUnicodeError(idx_t col_idx, bool force_error) { error_handler.Error(csv_error, force_error); } -void StringValueResult::HandleUnterminatedQuotes(idx_t col_idx, bool force_error) { - LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); - bool first_nl; - auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); - auto csv_error = - CSVError::UnterminatedQuotesError(state_machine.options, col_idx, lines_per_batch, borked_line, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); - error_handler.Error(csv_error, force_error); -} - bool StringValueResult::HandleError() { - if (current_error.is_set) { - switch (current_error.type) { + // Reconstruct CSV Line + for (auto &cur_error : current_errors) { + LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); + bool first_nl; + auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); + CSVError csv_error; + auto col_idx = cur_error.col_idx; + auto &line_pos = cur_error.error_position; + + switch (cur_error.type) { case CSVErrorType::TOO_MANY_COLUMNS: - HandleOverLimitRows(cur_col_id); + csv_error = + CSVError::IncorrectColumnAmountError(state_machine.options, col_idx, lines_per_batch, borked_line, + line_pos.GetGlobalPosition(requested_size, first_nl)); break; - case CSVErrorType::INVALID_UNICODE: - HandleUnicodeError(current_error.col_idx); + case CSVErrorType::INVALID_UNICODE: { + // We have to sanitize the CSV line + std::vector char_array(borked_line.begin(), borked_line.end()); + char_array.push_back('\0'); // Null-terminate the character array + Utf8Proc::MakeValid(&char_array[0], char_array.size()); + borked_line = {char_array.begin(), char_array.end() - 1}; + csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line, + line_pos.GetGlobalPosition(requested_size, first_nl)); break; + } case 
CSVErrorType::UNTERMINATED_QUOTES: - HandleUnterminatedQuotes(current_error.col_idx); + csv_error = CSVError::UnterminatedQuotesError(state_machine.options, col_idx, lines_per_batch, borked_line, + line_pos.GetGlobalPosition(requested_size, first_nl)); + break; + case CSVErrorType::CAST_ERROR: + csv_error = CSVError::CastError(state_machine.options, names[cur_error.col_idx], cur_error.error_message, + cur_error.col_idx, borked_line, lines_per_batch, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); break; default: throw InvalidInputException("CSV Error not allowed when inserting row"); } + error_handler.Error(csv_error); + } + if (!current_errors.empty()) { + current_errors.clear(); cur_col_id = 0; chunk_col_id = 0; - // An error occurred on this row, we are ignoring it and resetting our control flag - current_error.Reset(); return true; } return false; @@ -455,30 +467,6 @@ bool StringValueResult::AddRowInternal() { if (HandleError()) { return false; } - if (!cast_errors.empty()) { - // A wild casting error appears - for (auto &cast_error : cast_errors) { - std::ostringstream error; - // Casting Error Message - error << "Could not convert string \"" << cast_error.second << "\" to \'" - << LogicalTypeIdToString(parse_types[cast_error.first].first) << "\'"; - auto error_string = error.str(); - LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); - bool first_nl; - auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); - auto csv_error = CSVError::CastError( - state_machine.options, names[cast_error.first], error_string, cast_error.first, borked_line, - lines_per_batch, current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); - error_handler.Error(csv_error); - } - // If we got here it means we are ignoring errors, hence we need to signify to our result scanner to ignore this - // row - // Cleanup this line and continue - cast_errors.clear(); - cur_col_id = 0; - chunk_col_id = 0; - return false; - } NullPaddingQuotedNewlineCheck(); quoted_new_line = false; // We need to check if we are getting the correct number of columns here. @@ -559,7 +547,9 @@ void StringValueResult::InvalidState(StringValueResult &result) { if (force_error) { result.HandleUnicodeError(result.cur_col_id, force_error); } - result.current_error = {CSVErrorType::UNTERMINATED_QUOTES, result.cur_col_id}; + result.current_errors.push_back({CSVErrorType::INVALID_UNICODE, + result.cur_col_id, + {result.iterator.pos.buffer_idx, result.last_position, result.buffer_size}}); } bool StringValueResult::EmptyLine(StringValueResult &result, const idx_t buffer_pos) { @@ -1229,9 +1219,12 @@ void StringValueScanner::FinalizeChunkProcess() { // If we are not done we have two options. // 1) If a boundary is set. if (iterator.IsBoundarySet()) { - if (!(result.current_error == CSVErrorType::UNTERMINATED_QUOTES)) { - iterator.done = true; + for (auto &cur_error : result.current_errors) { + if (!(cur_error == CSVErrorType::UNTERMINATED_QUOTES)) { + iterator.done = true; + } } + // We read until the next line or until we have nothing else to read. 
// Move to next buffer if (!cur_buffer_handle) { diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index e36266d90f69..4a54bc7dcccc 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -64,16 +64,17 @@ class FullLinePosition { class CurrentError { public: - CurrentError() : is_set(false) {}; - CurrentError(CSVErrorType type, idx_t col_idx_p) : is_set(true), type(type), col_idx(col_idx_p) {}; - void Reset() { - is_set = false; - } - bool is_set; + CurrentError(CSVErrorType type, idx_t col_idx_p, LinePosition error_position_p) + : type(type), col_idx(col_idx_p), error_position(error_position_p) {}; + CSVErrorType type; idx_t col_idx; + string error_message; + //! Exact Position where the error happened + LinePosition error_position; + friend bool operator==(const CurrentError &error, CSVErrorType other) { - return error.is_set && error.type == other; + return error.type == other; } }; @@ -81,8 +82,8 @@ class StringValueResult : public ScannerResult { public: StringValueResult(CSVStates &states, CSVStateMachine &state_machine, const shared_ptr &buffer_handle, Allocator &buffer_allocator, idx_t result_size, - idx_t buffer_position, CSVErrorHandler &error_hander, CSVIterator &iterator, bool store_line_size, - shared_ptr csv_file_scan, idx_t &lines_read, bool sniffing); + idx_t buffer_position, CSVErrorHandler &error_handler, CSVIterator &iterator, + bool store_line_size, shared_ptr csv_file_scan, idx_t &lines_read, bool sniffing); ~StringValueResult(); @@ -120,7 +121,6 @@ class StringValueResult : public ScannerResult { unsafe_unique_array> parse_types; vector names; - unordered_map cast_errors; shared_ptr csv_file_scan; idx_t &lines_read; @@ -135,8 +135,8 @@ class StringValueResult : public ScannerResult { //! Requested size of buffers (i.e., either 32Mb or set by buffer_size parameter) idx_t requested_size; - //! Current Error if any - CurrentError current_error; + //! Errors happening in the current line (if any) + vector current_errors; bool sniffing; //! Specialized code for quoted values, makes sure to remove quotes and escapes @@ -153,10 +153,8 @@ class StringValueResult : public ScannerResult { //! Handles EmptyLine states static inline bool EmptyLine(StringValueResult &result, const idx_t buffer_pos); inline bool AddRowInternal(); - - void HandleOverLimitRows(idx_t col_idx); void HandleUnicodeError(idx_t col_idx, bool force_error = false); - void HandleUnterminatedQuotes(idx_t col_idx, bool force_error = false); + //! Certain errors should only be handled when adding the line, to ensure proper error propagation. 
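// (Illustrative, not part of the patch.) Given the operator== defined on
// CurrentError above, asking whether the pending list already holds an error
// of a given type reduces to a one-liner:
#include <algorithm>

template <class ErrorList, class ErrorType>
bool ContainsErrorType(const ErrorList &errors, ErrorType type) {
	return std::any_of(errors.begin(), errors.end(),
	                   [&](const auto &entry) { return entry == type; });
}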
bool HandleError(); inline void AddValueToVector(const char *value_ptr, const idx_t size, bool allocate = false); From 2fe296f606f1543c152d493a0449a27cc6672025 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 13 Mar 2024 14:46:23 +0100 Subject: [PATCH 044/147] More adjustments --- .../scanner/string_value_scanner.cpp | 26 ++++++++++++------- .../csv_scanner/string_value_scanner.hpp | 3 ++- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 277924d8c01a..144e8ea9d679 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -216,17 +216,17 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size break; } default: { - // By default we add a string + // By default, we add a string // We only evaluate if a string is utf8 valid, if it's actually a varchar if (parse_types[chunk_col_id].second && !Utf8Proc::IsValid(value_ptr, UnsafeNumericCast(size))) { bool force_error = !state_machine.options.ignore_errors.GetValue() && sniffing; + LinePosition error_position {iterator.pos.buffer_idx, last_position, buffer_size}; // Invalid unicode, we must error if (force_error) { - HandleUnicodeError(cur_col_id, force_error); + HandleUnicodeError(cur_col_id, error_position); } // If we got here, we are ingoring errors, hence we must ignore this line. - current_errors.push_back( - {CSVErrorType::INVALID_UNICODE, cur_col_id, {iterator.pos.buffer_idx, last_position, buffer_size}}); + current_errors.push_back({CSVErrorType::INVALID_UNICODE, cur_col_id, error_position}); break; } if (allocate) { @@ -334,7 +334,7 @@ void StringValueResult::AddValue(StringValueResult &result, const idx_t buffer_p result.last_position = buffer_pos + 1; } -void StringValueResult::HandleUnicodeError(idx_t col_idx, bool force_error) { +void StringValueResult::HandleUnicodeError(idx_t col_idx, LinePosition &error_position) { bool first_nl; auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); // sanitize borked line @@ -344,8 +344,8 @@ void StringValueResult::HandleUnicodeError(idx_t col_idx, bool force_error) { borked_line = {char_array.begin(), char_array.end() - 1}; LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); auto csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); - error_handler.Error(csv_error, force_error); + error_position.GetGlobalPosition(requested_size, first_nl)); + error_handler.Error(csv_error, true); } bool StringValueResult::HandleError() { @@ -381,7 +381,7 @@ bool StringValueResult::HandleError() { case CSVErrorType::CAST_ERROR: csv_error = CSVError::CastError(state_machine.options, names[cur_error.col_idx], cur_error.error_message, cur_error.col_idx, borked_line, lines_per_batch, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); + line_pos.GetGlobalPosition(requested_size, first_nl)); break; default: throw InvalidInputException("CSV Error not allowed when inserting row"); @@ -498,9 +498,10 @@ bool StringValueResult::AddRowInternal() { bool first_nl; auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); + LinePosition 
error_position {iterator.pos.buffer_idx, last_position, buffer_size}; auto csv_error = CSVError::IncorrectColumnAmountError( state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); + error_position.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); // If we are here we ignore_errors, so we delete this line number_of_rows--; @@ -545,7 +546,8 @@ void StringValueResult::InvalidState(StringValueResult &result) { bool force_error = !result.state_machine.options.ignore_errors.GetValue() && result.sniffing; // Invalid unicode, we must error if (force_error) { - result.HandleUnicodeError(result.cur_col_id, force_error); + LinePosition error_position {result.iterator.pos.buffer_idx, result.last_position, result.buffer_size}; + result.HandleUnicodeError(result.cur_col_id, error_position); } result.current_errors.push_back({CSVErrorType::INVALID_UNICODE, result.cur_col_id, @@ -722,6 +724,8 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { bool first_nl; auto borked_line = result.line_positions_per_row[line_error].ReconstructCurrentLine(first_nl, result.buffer_handles); + // TODO: We can't really nicely get the position where this error happened, this should be solved by + // TODO: adding more types to implicit casting instead of relying on this flush. auto csv_error = CSVError::CastError( state_machine->options, csv_file_scan->names[col_idx], error_message, col_idx, borked_line, lines_per_batch, @@ -744,6 +748,8 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { bool first_nl; auto borked_line = result.line_positions_per_row[line_error].ReconstructCurrentLine( first_nl, result.buffer_handles); + // TODO: We can't really nicely get the position where this error happened, this should be solved by + // TODO: adding more types to implicit casting instead of relying on this flush. auto csv_error = CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], error_message, col_idx, borked_line, lines_per_batch, diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 4a54bc7dcccc..dfbe1f581bd5 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -153,7 +153,8 @@ class StringValueResult : public ScannerResult { //! Handles EmptyLine states static inline bool EmptyLine(StringValueResult &result, const idx_t buffer_pos); inline bool AddRowInternal(); - void HandleUnicodeError(idx_t col_idx, bool force_error = false); + //! Force the throw of a unicode error + void HandleUnicodeError(idx_t col_idx, LinePosition &error_position); //! Certain errors should only be handled when adding the line, to ensure proper error propagation. 
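// (Sketch under stated assumptions; the real LinePosition lives in
// string_value_scanner.hpp.) Every queued error now carries a
// {buffer_idx, buffer_pos, buffer_size} triple. Conceptually, the global byte
// offset reported to the rejects table is "all buffers before this one, plus
// the offset inside it"; the exact formula below is an assumption made for
// illustration only.
#include <cstddef>

struct LinePositionSketch {
	std::size_t buffer_idx;  // index of the buffer within the file
	std::size_t buffer_pos;  // byte offset inside that buffer
	std::size_t buffer_size; // actual size of that buffer

	std::size_t GetGlobalPosition(std::size_t requested_size, bool first_char_is_nl) const {
		// Assumes every earlier buffer was filled to the requested size.
		return buffer_idx * requested_size + buffer_pos - (first_char_is_nl ? 1 : 0);
	}
};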
bool HandleError(); From dcdb16a1f276a5babf33aeee1adbd43726f3fed2 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 13 Mar 2024 16:01:24 +0100 Subject: [PATCH 045/147] Add a new return option for start of the line as a byte position --- .../scanner/string_value_scanner.cpp | 41 +++++++++++-------- .../table_function/global_csv_state.cpp | 22 ++++++---- .../operator/csv_scanner/util/csv_error.cpp | 38 +++++++++-------- .../operator/persistent/csv_rejects_table.cpp | 14 ++++--- .../operator/csv_scanner/csv_error.hpp | 19 +++++---- 5 files changed, 80 insertions(+), 54 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 144e8ea9d679..103ff6341a65 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -247,7 +247,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size error << "Could not convert string \"" << std::string(value_ptr, size) << "\" to \'" << LogicalTypeIdToString(parse_types[cur_col_id].first) << "\'"; current_errors.push_back( - {CSVErrorType::INVALID_UNICODE, cur_col_id, {iterator.pos.buffer_idx, last_position, buffer_size}}); + {CSVErrorType::CAST_ERROR, cur_col_id, {iterator.pos.buffer_idx, last_position, buffer_size}}); current_errors.back().error_message = error.str(); } cur_col_id++; @@ -344,6 +344,7 @@ void StringValueResult::HandleUnicodeError(idx_t col_idx, LinePosition &error_po borked_line = {char_array.begin(), char_array.end() - 1}; LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); auto csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), error_position.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error, true); } @@ -360,9 +361,10 @@ bool StringValueResult::HandleError() { switch (cur_error.type) { case CSVErrorType::TOO_MANY_COLUMNS: - csv_error = - CSVError::IncorrectColumnAmountError(state_machine.options, col_idx, lines_per_batch, borked_line, - line_pos.GetGlobalPosition(requested_size, first_nl)); + csv_error = CSVError::IncorrectColumnAmountError( + state_machine.options, col_idx, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + line_pos.GetGlobalPosition(requested_size, first_nl)); break; case CSVErrorType::INVALID_UNICODE: { // We have to sanitize the CSV line @@ -371,16 +373,20 @@ bool StringValueResult::HandleError() { Utf8Proc::MakeValid(&char_array[0], char_array.size()); borked_line = {char_array.begin(), char_array.end() - 1}; csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), line_pos.GetGlobalPosition(requested_size, first_nl)); break; } case CSVErrorType::UNTERMINATED_QUOTES: - csv_error = CSVError::UnterminatedQuotesError(state_machine.options, col_idx, lines_per_batch, borked_line, - line_pos.GetGlobalPosition(requested_size, first_nl)); + csv_error = CSVError::UnterminatedQuotesError( + state_machine.options, col_idx, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + line_pos.GetGlobalPosition(requested_size, first_nl)); break; case CSVErrorType::CAST_ERROR: csv_error = 
CSVError::CastError(state_machine.options, names[cur_error.col_idx], cur_error.error_message, cur_error.col_idx, borked_line, lines_per_batch, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), line_pos.GetGlobalPosition(requested_size, first_nl)); break; default: @@ -404,7 +410,7 @@ void StringValueResult::QuotedNewLine(StringValueResult &result) { void StringValueResult::NullPaddingQuotedNewlineCheck() { // We do some checks for null_padding correctness if (state_machine.options.null_padding && iterator.IsBoundarySet() && quoted_new_line && iterator.done) { - // If we have null_padding set, we found a quoted new line, we are scanning the file in parallel and it's the + // If we have null_padding set, we found a quoted new line, we are scanning the file in parallel, and it's the // last row of this thread. LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); auto csv_error = CSVError::NullPaddingFail(state_machine.options, lines_per_batch); @@ -501,6 +507,7 @@ bool StringValueResult::AddRowInternal() { LinePosition error_position {iterator.pos.buffer_idx, last_position, buffer_size}; auto csv_error = CSVError::IncorrectColumnAmountError( state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), error_position.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); // If we are here we ignore_errors, so we delete this line @@ -549,7 +556,7 @@ void StringValueResult::InvalidState(StringValueResult &result) { LinePosition error_position {result.iterator.pos.buffer_idx, result.last_position, result.buffer_size}; result.HandleUnicodeError(result.cur_col_id, error_position); } - result.current_errors.push_back({CSVErrorType::INVALID_UNICODE, + result.current_errors.push_back({CSVErrorType::UNTERMINATED_QUOTES, result.cur_col_id, {result.iterator.pos.buffer_idx, result.last_position, result.buffer_size}}); } @@ -724,12 +731,11 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { bool first_nl; auto borked_line = result.line_positions_per_row[line_error].ReconstructCurrentLine(first_nl, result.buffer_handles); - // TODO: We can't really nicely get the position where this error happened, this should be solved by - // TODO: adding more types to implicit casting instead of relying on this flush. auto csv_error = CSVError::CastError( state_machine->options, csv_file_scan->names[col_idx], error_message, col_idx, borked_line, lines_per_batch, - result.line_positions_per_row[line_error].begin.GetGlobalPosition(result.result_size, first_nl)); + result.line_positions_per_row[line_error].begin.GetGlobalPosition(result.result_size, first_nl), + -1); error_handler->Error(csv_error); } borked_lines.insert(line_error++); @@ -748,13 +754,11 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { bool first_nl; auto borked_line = result.line_positions_per_row[line_error].ReconstructCurrentLine( first_nl, result.buffer_handles); - // TODO: We can't really nicely get the position where this error happened, this should be solved by - // TODO: adding more types to implicit casting instead of relying on this flush. 
- auto csv_error = - CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], error_message, - col_idx, borked_line, lines_per_batch, - result.line_positions_per_row[line_error].begin.GetGlobalPosition( - result.result_size, first_nl)); + auto csv_error = CSVError::CastError( + state_machine->options, csv_file_scan->names[col_idx], error_message, col_idx, borked_line, + lines_per_batch, + result.line_positions_per_row[line_error].begin.GetGlobalPosition(result.result_size, first_nl), + -1); error_handler->Error(csv_error); } @@ -1249,6 +1253,7 @@ void StringValueScanner::FinalizeChunkProcess() { } else { result.HandleError(); } + iterator.done = FinishedFile(); } else { // 2) If a boundary is not set // We read until the chunk is complete, or we have nothing else to read. diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index ae2fddf9df0b..3f819bbf170c 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -268,11 +268,19 @@ void CSVGlobalState::FillRejectsTable() { errors_appender.Append(file_idx); // 3. Row Line errors_appender.Append(row_line); - // 4. Byte Position where error occurred - errors_appender.Append(error.byte_position); - // 5. Column Index + // 4. Byte Position of the row error + errors_appender.Append(error.row_byte_position); + // 5. Byte Position where error occurred + if (error.byte_position == -1) { + // This means this error comes from a flush, and we don't support this yet, so we give it + // a null + errors_appender.Append(Value()); + } else { + errors_appender.Append(error.byte_position); + } + // 6. Column Index errors_appender.Append(col_idx + 1); - // 6. Column Name (If Applicable) + // 7. Column Name (If Applicable) switch (error.type) { case CSVErrorType::TOO_MANY_COLUMNS: errors_appender.Append(Value()); @@ -284,11 +292,11 @@ void CSVGlobalState::FillRejectsTable() { default: errors_appender.Append(string_t(bind_data.return_names[col_idx])); } - // 7. Error Type + // 8. Error Type errors_appender.Append(string_t(CSVErrorTypeToEnum(error.type))); - // 8. Original CSV Line + // 9. Original CSV Line errors_appender.Append(string_t(error.csv_row)); - // 9. Full Error Message + // 10. 
Full Error Message errors_appender.Append(string_t(error.error_message)); errors_appender.EndRow(); } diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index 7a8349288c07..1f93d945fd91 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -82,9 +82,10 @@ CSVError::CSVError(string error_message_p, CSVErrorType type_p, LinesPerBoundary } CSVError::CSVError(string error_message_p, CSVErrorType type_p, idx_t column_idx_p, string csv_row_p, - LinesPerBoundary error_info_p, idx_t byte_position_p, const CSVReaderOptions &reader_options) + LinesPerBoundary error_info_p, idx_t row_byte_position, int64_t byte_position_p, + const CSVReaderOptions &reader_options) : error_message(std::move(error_message_p)), type(type_p), column_idx(column_idx_p), csv_row(std::move(csv_row_p)), - error_info(error_info_p), byte_position(byte_position_p) { + error_info(error_info_p), row_byte_position(row_byte_position), byte_position(byte_position_p) { // What were the options std::ostringstream error; error << error_message << std::endl; @@ -114,13 +115,15 @@ CSVError CSVError::ColumnTypesError(case_insensitive_map_t sql_types_per_ } CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, idx_t column_idx, - string &csv_row, LinesPerBoundary error_info, idx_t byte_position) { + string &csv_row, LinesPerBoundary error_info, idx_t row_byte_position, + int64_t byte_position) { std::ostringstream error; // Which column error << "Error when converting column \"" << column_name << "\". "; // What was the cast error error << cast_error; - return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info, byte_position, options); + return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info, row_byte_position, + byte_position, options); } CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info, @@ -128,7 +131,8 @@ CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_s std::ostringstream error; error << "Maximum line size of " << options.maximum_line_size << " bytes exceeded. 
"; error << "Actual Size:" << actual_size << " bytes."; - return CSVError(error.str(), CSVErrorType::MAXIMUM_LINE_SIZE, 0, csv_row, error_info, byte_position, options); + return CSVError(error.str(), CSVErrorType::MAXIMUM_LINE_SIZE, 0, csv_row, error_info, byte_position, byte_position, + options); } CSVError CSVError::SniffingError(string &file_path) { @@ -150,34 +154,36 @@ CSVError CSVError::NullPaddingFail(const CSVReaderOptions &options, LinesPerBoun } CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, idx_t current_column, - LinesPerBoundary error_info, string &csv_row, idx_t byte_position) { + LinesPerBoundary error_info, string &csv_row, idx_t row_byte_position, + int64_t byte_position) { std::ostringstream error; error << "Value with unterminated quote found."; - return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info, byte_position, - options); + return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info, + row_byte_position, byte_position, options); } CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, idx_t actual_columns, - LinesPerBoundary error_info, string &csv_row, idx_t byte_position) { + LinesPerBoundary error_info, string &csv_row, idx_t row_byte_position, + int64_t byte_position) { std::ostringstream error; // How many columns were expected and how many were found error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1; if (actual_columns >= options.dialect_options.num_cols) { - return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, byte_position, - options); + return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, + row_byte_position, byte_position, options); } else { - return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, byte_position, - options); + return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, + row_byte_position, byte_position, options); } } CSVError CSVError::InvalidUTF8(const CSVReaderOptions &options, idx_t current_column, LinesPerBoundary error_info, - string &csv_row, idx_t byte_position) { + string &csv_row, idx_t row_byte_position, int64_t byte_position) { std::ostringstream error; // How many columns were expected and how many were found error << "Invalid unicode (byte sequence mismatch) detected."; - return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, current_column, csv_row, error_info, byte_position, - options); + return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, current_column, csv_row, error_info, row_byte_position, + byte_position, options); } bool CSVErrorHandler::PrintLineNumber(CSVError &error) { diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 429d385553e2..31f63d0279b8 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -104,17 +104,19 @@ void CSVRejectsTable::InitializeTable(ClientContext &context, const ReadCSVData info->columns.AddColumn(ColumnDefinition("file_id", LogicalType::UBIGINT)); // 2. Row Line info->columns.AddColumn(ColumnDefinition("line", LogicalType::UBIGINT)); - // 3. Byte Position where error occurred + // 3. 
Byte Position of the start of the line + info->columns.AddColumn(ColumnDefinition("line_byte_position", LogicalType::UBIGINT)); + // 4. Byte Position where error occurred info->columns.AddColumn(ColumnDefinition("byte_position", LogicalType::UBIGINT)); - // 4. Column Index (If Applicable) + // 5. Column Index (If Applicable) info->columns.AddColumn(ColumnDefinition("column_idx", LogicalType::UBIGINT)); - // 5. Column Name (If Applicable) + // 6. Column Name (If Applicable) info->columns.AddColumn(ColumnDefinition("column_name", LogicalType::VARCHAR)); - // 6. Error Type + // 7. Error Type info->columns.AddColumn(ColumnDefinition("error_type", enum_type)); - // 7. Original CSV Line + // 8. Original CSV Line info->columns.AddColumn(ColumnDefinition("csv_line", LogicalType::VARCHAR)); - // 8. Full Error Message + // 9. Full Error Message info->columns.AddColumn(ColumnDefinition("error_message", LogicalType::VARCHAR)); catalog.CreateTable(context, std::move(info)); } diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp index 98f460127d83..d6a6ce7ca657 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp @@ -52,13 +52,14 @@ class CSVError { public: CSVError() {}; CSVError(string error_message, CSVErrorType type, idx_t column_idx, string csv_row, LinesPerBoundary error_info, - idx_t byte_position, const CSVReaderOptions &reader_options); + idx_t row_byte_position, int64_t byte_position, const CSVReaderOptions &reader_options); CSVError(string error_message, CSVErrorType type, LinesPerBoundary error_info); //! Produces error messages for column name -> type mismatch. static CSVError ColumnTypesError(case_insensitive_map_t sql_types_per_column, const vector &names); //! Produces error messages for casting errors static CSVError CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, - idx_t column_idx, string &csv_row, LinesPerBoundary error_info, idx_t byte_position); + idx_t column_idx, string &csv_row, LinesPerBoundary error_info, idx_t row_byte_position, + int64_t byte_position); //! Produces error for when the line size exceeds the maximum line size option static CSVError LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info, string &csv_row, idx_t byte_position); @@ -66,14 +67,16 @@ class CSVError { static CSVError SniffingError(string &file_path); //! Produces error messages for unterminated quoted values static CSVError UnterminatedQuotesError(const CSVReaderOptions &options, idx_t current_column, - LinesPerBoundary error_info, string &csv_row, idx_t byte_position); + LinesPerBoundary error_info, string &csv_row, idx_t row_byte_position, + int64_t byte_position); //! Produces error messages for null_padding option is set and we have quoted new values in parallel static CSVError NullPaddingFail(const CSVReaderOptions &options, LinesPerBoundary error_info); //! 
Produces error for incorrect (e.g., smaller and lower than the predefined) number of columns in a CSV Line static CSVError IncorrectColumnAmountError(const CSVReaderOptions &state_machine, idx_t actual_columns, - LinesPerBoundary error_info, string &csv_row, idx_t byte_position); + LinesPerBoundary error_info, string &csv_row, idx_t row_byte_position, + int64_t byte_position); static CSVError InvalidUTF8(const CSVReaderOptions &options, idx_t current_column, LinesPerBoundary error_info, - string &csv_row, idx_t byte_position); + string &csv_row, idx_t row_byte_position, int64_t byte_position); idx_t GetBoundaryIndex() { return error_info.boundary_idx; @@ -91,8 +94,10 @@ class CSVError { string csv_row; //! Line information regarding this error LinesPerBoundary error_info; - //! Global Byte Position where error occurred. - idx_t byte_position; + //! Byte position of where the row starts + idx_t row_byte_position; + //! Byte Position where error occurred. + int64_t byte_position; }; class CSVErrorHandler { From 1f4270c335e98c8106de30b5af3b062ba5143ca0 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 13 Mar 2024 16:27:48 +0100 Subject: [PATCH 046/147] More on the byte per row and per value --- data/csv/rejects/unquoted/unquoted_last_value.csv | 2 +- .../operator/csv_scanner/scanner/string_value_scanner.cpp | 4 ++++ .../operator/csv_scanner/table_function/global_csv_state.cpp | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/data/csv/rejects/unquoted/unquoted_last_value.csv b/data/csv/rejects/unquoted/unquoted_last_value.csv index 0d714083e9c8..68dec7d40d9c 100644 --- a/data/csv/rejects/unquoted/unquoted_last_value.csv +++ b/data/csv/rejects/unquoted/unquoted_last_value.csv @@ -2,4 +2,4 @@ "bla" "bla" "bla" -"bla +"bla \ No newline at end of file diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 103ff6341a65..0b300ee30d00 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -1030,6 +1030,10 @@ bool StringValueScanner::MoveToNextBuffer() { lines_read++; } else if (states.IsQuotedCurrent()) { // Unterminated quote + LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, + result.buffer_size}; + result.current_line_position.begin = result.current_line_position.end; + result.current_line_position.end = current_line_start; result.InvalidState(result); } else { result.AddRow(result, previous_buffer_handle->actual_size); diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 3f819bbf170c..94951168ddd2 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -269,7 +269,7 @@ void CSVGlobalState::FillRejectsTable() { // 3. Row Line errors_appender.Append(row_line); // 4. Byte Position of the row error - errors_appender.Append(error.row_byte_position); + errors_appender.Append(error.row_byte_position + 1); // 5. 
Byte Position where error occurred
 		if (error.byte_position == -1) {
 			// This means this error comes from a flush, and we don't support this yet, so we give it

From ab2e9b13fb8177869916dd48112fba555bb4c8b4 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Thu, 14 Mar 2024 13:42:33 +0100
Subject: [PATCH 047/147] Change last_position to be a LinePosition

---
 .../scanner/string_value_scanner.cpp          | 72 +++++++++----------
 .../csv_scanner/string_value_scanner.hpp      |  2 +-
 2 files changed, 33 insertions(+), 41 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 0b300ee30d00..2b3eaf252a9f 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -28,7 +28,7 @@ StringValueResult::StringValueResult(CSVStates &states, CSVStateMachine &state_m
 	// Buffer Information
 	buffer_ptr = buffer_handle->Ptr();
 	buffer_size = buffer_handle->actual_size;
-	last_position = buffer_position;
+	last_position = {buffer_handle->buffer_idx, buffer_position, buffer_size};
 	requested_size = buffer_handle->requested_size;
 
 	// Current Result information
@@ -119,8 +119,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 		}
 		if (error) {
 			// We error pointing to the current value error.
-			current_errors.push_back(
-			    {CSVErrorType::TOO_MANY_COLUMNS, cur_col_id, {iterator.pos.buffer_idx, last_position, buffer_size}});
+			current_errors.push_back({CSVErrorType::TOO_MANY_COLUMNS, cur_col_id, last_position});
 		}
 		return;
 	}
@@ -141,9 +140,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 		if (empty) {
 			if (parse_types[chunk_col_id].first != LogicalTypeId::VARCHAR) {
 				// If it is not a varchar, empty values are not accepted, we must error.
-				current_errors.push_back({CSVErrorType::CAST_ERROR,
-				                          cur_col_id,
-				                          {iterator.pos.buffer_idx, last_position, buffer_size}});
+				current_errors.push_back({CSVErrorType::CAST_ERROR, cur_col_id, last_position});
 			}
 			static_cast(vector_ptr[chunk_col_id])[number_of_rows] = string_t();
 		} else {
@@ -220,13 +217,12 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 	// We only evaluate if a string is utf8 valid, if it's actually a varchar
 	if (parse_types[chunk_col_id].second && !Utf8Proc::IsValid(value_ptr, UnsafeNumericCast(size))) {
 		bool force_error = !state_machine.options.ignore_errors.GetValue() && sniffing;
-		LinePosition error_position {iterator.pos.buffer_idx, last_position, buffer_size};
 		// Invalid unicode, we must error
 		if (force_error) {
-			HandleUnicodeError(cur_col_id, error_position);
+			HandleUnicodeError(cur_col_id, last_position);
 		}
 		// If we got here, we are ignoring errors, hence we must ignore this line.
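		// Clarification (editor's sketch, not a hunk of this patch): with
		// last_position now a full LinePosition {buffer_idx, buffer_pos,
		// buffer_size}, the error records below can reuse it directly instead
		// of assembling an ad-hoc error_position from the iterator state.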
- current_errors.push_back({CSVErrorType::INVALID_UNICODE, cur_col_id, error_position}); + current_errors.push_back({CSVErrorType::INVALID_UNICODE, cur_col_id, last_position}); break; } if (allocate) { @@ -246,8 +242,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size // Casting Error Message error << "Could not convert string \"" << std::string(value_ptr, size) << "\" to \'" << LogicalTypeIdToString(parse_types[cur_col_id].first) << "\'"; - current_errors.push_back( - {CSVErrorType::CAST_ERROR, cur_col_id, {iterator.pos.buffer_idx, last_position, buffer_size}}); + current_errors.push_back({CSVErrorType::CAST_ERROR, cur_col_id, last_position}); current_errors.back().error_message = error.str(); } cur_col_id++; @@ -309,7 +304,7 @@ void StringValueResult::AddQuotedValue(StringValueResult &result, const idx_t bu result.parse_chunk.data[result.chunk_col_id]); result.AddValueToVector(value.GetData(), value.GetSize()); } else { - if (buffer_pos < result.last_position + 2) { + if (buffer_pos < result.last_position.buffer_pos + 2) { // empty value auto value = string_t(); result.AddValueToVector(value.GetData(), value.GetSize()); @@ -323,15 +318,16 @@ void StringValueResult::AddQuotedValue(StringValueResult &result, const idx_t bu } void StringValueResult::AddValue(StringValueResult &result, const idx_t buffer_pos) { - if (result.last_position > buffer_pos) { + if (result.last_position.buffer_pos > buffer_pos) { return; } if (result.quoted) { StringValueResult::AddQuotedValue(result, buffer_pos); } else { - result.AddValueToVector(result.buffer_ptr + result.last_position, buffer_pos - result.last_position); + result.AddValueToVector(result.buffer_ptr + result.last_position.buffer_pos, + buffer_pos - result.last_position.buffer_pos); } - result.last_position = buffer_pos + 1; + result.last_position.buffer_pos = buffer_pos + 1; } void StringValueResult::HandleUnicodeError(idx_t col_idx, LinePosition &error_position) { @@ -504,11 +500,10 @@ bool StringValueResult::AddRowInternal() { bool first_nl; auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); - LinePosition error_position {iterator.pos.buffer_idx, last_position, buffer_size}; auto csv_error = CSVError::IncorrectColumnAmountError( state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, current_line_position.begin.GetGlobalPosition(requested_size, first_nl), - error_position.GetGlobalPosition(requested_size, first_nl)); + last_position.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); // If we are here we ignore_errors, so we delete this line number_of_rows--; @@ -526,22 +521,23 @@ bool StringValueResult::AddRowInternal() { } bool StringValueResult::AddRow(StringValueResult &result, const idx_t buffer_pos) { - if (result.last_position <= buffer_pos) { + if (result.last_position.buffer_pos <= buffer_pos) { // We add the value if (result.quoted) { StringValueResult::AddQuotedValue(result, buffer_pos); } else { - result.AddValueToVector(result.buffer_ptr + result.last_position, buffer_pos - result.last_position); + result.AddValueToVector(result.buffer_ptr + result.last_position.buffer_pos, + buffer_pos - result.last_position.buffer_pos); } if (result.state_machine.dialect_options.state_machine_options.new_line == NewLineIdentifier::CARRY_ON) { if (result.states.states[1] == CSVState::RECORD_SEPARATOR) { // Even though this is marked as a carry on, this is a hippie 
mixie - result.last_position = buffer_pos + 1; + result.last_position.buffer_pos = buffer_pos + 1; } else { - result.last_position = buffer_pos + 2; + result.last_position.buffer_pos = buffer_pos + 2; } } else { - result.last_position = buffer_pos + 1; + result.last_position.buffer_pos = buffer_pos + 1; } } @@ -553,20 +549,17 @@ void StringValueResult::InvalidState(StringValueResult &result) { bool force_error = !result.state_machine.options.ignore_errors.GetValue() && result.sniffing; // Invalid unicode, we must error if (force_error) { - LinePosition error_position {result.iterator.pos.buffer_idx, result.last_position, result.buffer_size}; - result.HandleUnicodeError(result.cur_col_id, error_position); + result.HandleUnicodeError(result.cur_col_id, result.last_position); } - result.current_errors.push_back({CSVErrorType::UNTERMINATED_QUOTES, - result.cur_col_id, - {result.iterator.pos.buffer_idx, result.last_position, result.buffer_size}}); + result.current_errors.push_back({CSVErrorType::UNTERMINATED_QUOTES, result.cur_col_id, result.last_position}); } bool StringValueResult::EmptyLine(StringValueResult &result, const idx_t buffer_pos) { // We care about empty lines if this is a single column csv file - result.last_position = buffer_pos + 1; + result.last_position = {result.iterator.pos.buffer_idx, result.iterator.pos.buffer_pos + 1, result.buffer_size}; if (result.states.IsCarriageReturn() && result.state_machine.dialect_options.state_machine_options.new_line == NewLineIdentifier::CARRY_ON) { - result.last_position++; + result.last_position.buffer_pos++; } if (result.number_of_columns == 1) { if (result.null_str_size == 0) { @@ -786,9 +779,8 @@ void StringValueScanner::Initialize() { !state_machine->options.dialect_options.skip_rows.IsSetByUser())) { SetStart(); } - result.last_position = iterator.pos.buffer_pos; - result.current_line_position.begin = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, - cur_buffer_handle->actual_size}; + result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, cur_buffer_handle->actual_size}; + result.current_line_position.begin = result.last_position; result.current_line_position.end = result.current_line_position.begin; } @@ -906,12 +898,12 @@ void StringValueScanner::ProcessOverbufferValue() { states.Initialize(); string overbuffer_string; auto previous_buffer = previous_buffer_handle->Ptr(); - if (result.last_position == previous_buffer_handle->actual_size) { - state_machine->Transition(states, previous_buffer[result.last_position - 1]); + if (result.last_position.buffer_pos == previous_buffer_handle->actual_size) { + state_machine->Transition(states, previous_buffer[result.last_position.buffer_pos - 1]); } idx_t j = 0; result.quoted = false; - for (idx_t i = result.last_position; i < previous_buffer_handle->actual_size; i++) { + for (idx_t i = result.last_position.buffer_pos; i < previous_buffer_handle->actual_size; i++) { state_machine->Transition(states, previous_buffer[i]); if (states.EmptyLine() || states.IsCurrentNewRow()) { continue; @@ -995,9 +987,9 @@ void StringValueScanner::ProcessOverbufferValue() { } if (states.IsCarriageReturn() && state_machine->dialect_options.state_machine_options.new_line == NewLineIdentifier::CARRY_ON) { - result.last_position = ++iterator.pos.buffer_pos + 1; + result.last_position = {iterator.pos.buffer_idx, ++iterator.pos.buffer_pos + 1, result.buffer_size}; } else { - result.last_position = ++iterator.pos.buffer_pos; + result.last_position = {iterator.pos.buffer_idx, ++iterator.pos.buffer_pos, 
result.buffer_size};
 	}
 	// Be sure to reset the quoted and escaped variables
 	result.quoted = false;
@@ -1180,7 +1172,7 @@ void StringValueScanner::SetStart() {
 		    scan_finder->previous_buffer_handle->is_last_buffer) {
 			iterator.pos.buffer_idx = scan_finder->iterator.pos.buffer_idx;
 			iterator.pos.buffer_pos = scan_finder->iterator.pos.buffer_pos;
-			result.last_position = iterator.pos.buffer_pos;
+			result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
 			iterator.done = scan_finder->iterator.done;
 			return;
 		}
@@ -1199,7 +1191,7 @@ void StringValueScanner::SetStart() {
 			// If things go terribly wrong, we never loop indefinitely.
 			iterator.pos.buffer_idx = scan_finder->iterator.pos.buffer_idx;
 			iterator.pos.buffer_pos = scan_finder->iterator.pos.buffer_pos;
-			result.last_position = iterator.pos.buffer_pos;
+			result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
 			iterator.done = scan_finder->iterator.done;
 			return;
 		}
@@ -1216,7 +1208,7 @@ void StringValueScanner::SetStart() {
 	}
 	iterator.pos.buffer_idx = scan_finder->result.current_line_position.begin.buffer_idx;
 	iterator.pos.buffer_pos = scan_finder->result.current_line_position.begin.buffer_pos;
-	result.last_position = iterator.pos.buffer_pos;
+	result.last_position = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
 }

 void StringValueScanner::FinalizeChunkProcess() {
diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
index dfbe1f581bd5..58b312c75c53 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp
@@ -92,7 +92,7 @@ class StringValueResult : public ScannerResult {
 	unsafe_vector validity_mask;

 	//! Variables to iterate over the CSV buffers
-	idx_t last_position;
+	LinePosition last_position;
 	char *buffer_ptr;
 	idx_t buffer_size;

From b0f804b88d6590309272b7677892430eb18f3b2a Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Thu, 14 Mar 2024 14:50:09 +0100
Subject: [PATCH 048/147] More adjustments

---
 .../scanner/string_value_scanner.cpp          |  7 +-
 .../csv/rejects/csv_unquoted_rejects.test     | 66 +++++++++++--------
 2 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 2b3eaf252a9f..f2339a8f1cb5 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -1249,7 +1249,12 @@ void StringValueScanner::FinalizeChunkProcess() {
 		} else {
 			result.HandleError();
 		}
-		iterator.done = FinishedFile();
+		if (!iterator.done) {
+			if (iterator.pos.buffer_pos >= iterator.GetEndPos() || iterator.pos.buffer_idx > iterator.GetBufferIdx() ||
+			    FinishedFile()) {
+				iterator.done = true;
+			}
+		}
 	} else {
 		// 2) If a boundary is not set
 		// We read until the chunk is complete, or we have nothing else to read.
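Editor's clarification for the last_position rework (PATCH 046-048): these
hunks hinge on last_position carrying a full buffer coordinate instead of a
bare offset. Below is a minimal sketch of the LinePosition shape the diffs
rely on; the member names and the GetGlobalPosition call match the hunks, but
the arithmetic shown is an illustrative assumption, not the verbatim DuckDB
implementation:

	struct LinePosition {
		idx_t buffer_idx = 0;  // which CSV buffer holds this position
		idx_t buffer_pos = 0;  // byte offset inside that buffer
		idx_t buffer_size = 0; // actual size of that buffer
		// Global file offset, assuming every buffer before this one was
		// filled to requested_size bytes; first_nl discounts a leading
		// newline when a line is reconstructed across buffers.
		idx_t GetGlobalPosition(idx_t requested_size, bool first_nl = false) const {
			return requested_size * buffer_idx + buffer_pos - first_nl;
		}
	};

Carrying buffer_idx along is what lets the scanner report both
line_byte_position (start of the row) and byte_position (offset of the faulty
value) to the rejects table, even when a row straddles two buffers.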
diff --git a/test/sql/copy/csv/rejects/csv_unquoted_rejects.test b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test index 13c13b8b9fa7..1ce59c7c826e 100644 --- a/test/sql/copy/csv/rejects/csv_unquoted_rejects.test +++ b/test/sql/copy/csv/rejects/csv_unquoted_rejects.test @@ -11,8 +11,7 @@ query II SELECT * FROM read_csv( 'data/csv/rejects/unquoted/basic.csv', columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1, quote = '"', escape = '"'); + store_rejects=true, auto_detect=false, header = 1, quote = '"', escape = '"'); ---- bla 1 bla 2 @@ -21,21 +20,23 @@ bla 1 bla 2 bla 3 -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIII rowsort +SELECT regexp_replace(file_path, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line,line_byte_position, byte_position +FROM reject_scans inner join reject_errors on (reject_scans.scan_id = reject_errors.scan_id and reject_scans.file_id = reject_errors.file_id); ---- -data/csv/rejects/unquoted/basic.csv 5 1 "a" UNQUOTED VALUE "blaaaaaaaaaaaaaa"bla,4 28 +data/csv/rejects/unquoted/basic.csv 5 1 a UNQUOTED VALUE "blaaaaaaaaaaaaaa"bla,4 29 29 + +statement ok +DROP TABLE reject_scans; statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; query II SELECT * FROM read_csv( 'data/csv/rejects/unquoted/unquoted_new_line.csv', columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1, quote = '"', escape = '"'); + store_rejects=true, auto_detect=false, header = 1, quote = '"', escape = '"'); ---- bla 1 bla 2 @@ -44,35 +45,40 @@ bla 1 bla 2 bla 3 -query IIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, byte_position -FROM csv_rejects_table; +query IIIIIII rowsort +SELECT regexp_replace(file_path, '\\', '/', 'g'), line, column_idx, column_name, error_type, line_byte_position,byte_position +FROM reject_scans inner join reject_errors on (reject_scans.scan_id = reject_errors.scan_id and reject_scans.file_id = reject_errors.file_id); ---- -data/csv/rejects/unquoted/unquoted_new_line.csv 5 1 "a" UNQUOTED VALUE 28 +data/csv/rejects/unquoted/unquoted_new_line.csv 5 1 a UNQUOTED VALUE 29 29 + +statement ok +DROP TABLE reject_scans; statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; query I SELECT * FROM read_csv( 'data/csv/rejects/unquoted/unquoted_last_value.csv', columns = {'a': 'VARCHAR'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 0, quote = '"', escape = '"'); + store_rejects=true, auto_detect=false, header = 0, quote = '"', escape = '"'); ---- blaaaaaaaaaaaaaa bla bla bla -query IIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, byte_position -FROM csv_rejects_table; +query IIIIIIII rowsort +SELECT regexp_replace(file_path, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line,line_byte_position, byte_position +FROM reject_scans inner join reject_errors on (reject_scans.scan_id = reject_errors.scan_id and reject_scans.file_id = reject_errors.file_id); ---- -data/csv/rejects/unquoted/unquoted_last_value.csv 5 1 "a" UNQUOTED VALUE 31 +data/csv/rejects/unquoted/unquoted_last_value.csv 5 1 a UNQUOTED VALUE "bla 38 38 statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_scans; + 
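# Clarification (editor's note, not part of the diff): store_rejects=true is
# assumed to write rejects into two linked tables rather than one, with
# reject_scans describing each scan (scan_id, file_id, file_path, ...) and
# reject_errors holding one row per faulty line; hence the join on
# (scan_id, file_id) in the queries above and the two DROPs here.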
+statement ok
+DROP TABLE reject_errors;

 loop buffer_size 35 40

 query II
 SELECT * FROM read_csv(
     'data/csv/rejects/unquoted/basic.csv',
     columns = {'a': 'VARCHAR', 'b': 'INTEGER'},
-    rejects_table='csv_rejects_table', buffer_size=${buffer_size},
-    ignore_errors=true, auto_detect=false, header = 1, quote = '"', escape = '"', buffer_size=35);
+    buffer_size=${buffer_size},
+    store_rejects=true, auto_detect=false, header = 1, quote = '"', escape = '"');
 ----
 bla	1
 bla	2
@@ -90,13 +96,17 @@ bla	1
 bla	2
 bla	3

+
 query IIIIIIII rowsort
-SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
-FROM csv_rejects_table;
+SELECT regexp_replace(file_path, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line,line_byte_position, byte_position
+FROM reject_scans inner join reject_errors on (reject_scans.scan_id = reject_errors.scan_id and reject_scans.file_id = reject_errors.file_id);
 ----
-data/csv/rejects/unquoted/basic.csv	5	1	"a"	UNQUOTED VALUE	"blaaaaaaaaaaaaaa"bla,4	28
+data/csv/rejects/unquoted/basic.csv	5	1	a	UNQUOTED VALUE	"blaaaaaaaaaaaaaa"bla,4	29	29
+
+statement ok
+DROP TABLE reject_scans;

 statement ok
-DROP TABLE csv_rejects_table;
+DROP TABLE reject_errors;

 endloop
\ No newline at end of file

From a2a9982e7e619661f9fa7abc9e10fe87cb5dcc74 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 20 Mar 2024 11:26:53 +0100
Subject: [PATCH 049/147] Fix progress over multiple very large files

---
 .../table_function/global_csv_state.cpp       | 19 ++++++++++---------
 test/sql/copy/csv/test_gzipped.test           |  0
 2 files changed, 10 insertions(+), 9 deletions(-)
 create mode 100644 test/sql/copy/csv/test_gzipped.test

diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
index 863b937f2186..72707df66324 100644
--- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
+++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
@@ -43,16 +43,16 @@ double CSVGlobalState::GetProgress(const ReadCSVData &bind_data_p) const {
 	lock_guard parallel_lock(main_mutex);
 	idx_t total_files = bind_data.files.size();
 	// get the progress WITHIN the current file
-	double progress;
+	double percentage = 0;
 	if (file_scans.back()->file_size == 0) {
-		progress = 1.0;
+		percentage = 1.0;
 	} else {
 		// for compressed files, the bytes read may be greater than the file size.
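		// Clarification (editor's note, not a hunk of this patch): the
		// replacement below averages progress over every file opened so far,
		//   percentage = (1 / total_files) * sum_i min(1.0, bytes_read_i / file_size_i)
		// where min() clamps compressed files, whose bytes_read counts
		// decompressed bytes and can therefore exceed the on-disk file_size.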
- progress = std::min(1.0, double(file_scans.back()->bytes_read) / double(file_scans.back()->file_size)); + for (auto &file : file_scans) { + percentage += + (double(1) / double(total_files)) * std::min(1.0, double(file->bytes_read) / double(file->file_size)); + } } - // now get the total percentage of files read - double percentage = double(current_boundary.GetFileIdx()) / total_files; - percentage += (double(1) / double(total_files)) * progress; return percentage * 100; } @@ -66,8 +66,9 @@ unique_ptr CSVGlobalState::Next() { if (cur_idx == 0) { current_file = file_scans.back(); } else { - current_file = make_shared(context, bind_data.files[cur_idx], bind_data.options, cur_idx, - bind_data, column_ids, file_schema); + file_scans.emplace_back(make_shared(context, bind_data.files[cur_idx], bind_data.options, + cur_idx, bind_data, column_ids, file_schema)); + current_file = file_scans.back(); } auto csv_scanner = make_uniq(scanner_idx++, current_file->buffer_manager, current_file->state_machine, @@ -98,7 +99,7 @@ unique_ptr CSVGlobalState::Next() { // If we have a next file we have to construct the file scan for that file_scans.emplace_back(make_shared(context, bind_data.files[current_file_idx], bind_data.options, current_file_idx, bind_data, column_ids, - file_schema)); + file_schema, single_threaded)); // And re-start the boundary-iterator auto buffer_size = file_scans.back()->buffer_manager->GetBuffer(0)->actual_size; current_boundary = CSVIterator(current_file_idx, 0, 0, 0, buffer_size); diff --git a/test/sql/copy/csv/test_gzipped.test b/test/sql/copy/csv/test_gzipped.test new file mode 100644 index 000000000000..e69de29bb2d1 From 3ab61710c8dbc6889c4299e61e45ed42368af097 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 20 Mar 2024 12:52:58 +0100 Subject: [PATCH 050/147] Dont store buffers if doing single threaded scan over multiple files --- .../csv_scanner/buffer_manager/csv_buffer.cpp | 20 +++++++++---------- .../buffer_manager/csv_buffer_manager.cpp | 14 +++++++++---- .../scanner/string_value_scanner.cpp | 4 ++-- .../table_function/csv_file_scanner.cpp | 9 +++++---- .../table_function/global_csv_state.cpp | 7 +++++-- src/function/table/copy_csv.cpp | 2 +- src/function/table/read_csv.cpp | 2 +- src/function/table/sniff_csv.cpp | 2 +- .../operator/csv_scanner/csv_buffer.hpp | 9 +++++---- .../csv_scanner/csv_buffer_manager.hpp | 5 ++++- .../operator/csv_scanner/csv_file_scanner.hpp | 5 +++-- src/main/relation/read_csv_relation.cpp | 2 +- 12 files changed, 48 insertions(+), 33 deletions(-) diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp index 8c29ae79fb43..6ac66783f041 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp @@ -4,9 +4,9 @@ namespace duckdb { CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle, - idx_t &global_csv_current_position, idx_t file_number_p) + idx_t &global_csv_current_position, idx_t file_number_p, bool single_threaded) : context(context), file_number(file_number_p), can_seek(file_handle.CanSeek()) { - AllocateBuffer(buffer_size_p); + AllocateBuffer(buffer_size_p, can_seek || single_threaded); auto buffer = Ptr(); actual_buffer_size = file_handle.Read(buffer, buffer_size_p); while (actual_buffer_size < buffer_size_p && !file_handle.FinishedReading()) { @@ -18,10 +18,10 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t 
buffer_size_p, CSVFileHandle } CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t buffer_size, - idx_t global_csv_current_position, idx_t file_number_p, idx_t buffer_idx_p) + idx_t global_csv_current_position, idx_t file_number_p, idx_t buffer_idx_p, bool single_threaded) : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p), can_seek(file_handle.CanSeek()), buffer_idx(buffer_idx_p) { - AllocateBuffer(buffer_size); + AllocateBuffer(buffer_size, single_threaded || can_seek); auto buffer = handle.Ptr(); actual_buffer_size = file_handle.Read(handle.Ptr(), buffer_size); while (actual_buffer_size < buffer_size && !file_handle.FinishedReading()) { @@ -32,15 +32,16 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b } shared_ptr CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p, - bool &has_seaked) { + bool &has_seaked, bool single_threaded) { if (has_seaked) { // This means that at some point a reload was done, and we are currently on the incorrect position in our file // handle file_handle.Seek(global_csv_start + actual_buffer_size); has_seaked = false; } - auto next_csv_buffer = make_shared(file_handle, context, buffer_size, - global_csv_start + actual_buffer_size, file_number_p, buffer_idx + 1); + auto next_csv_buffer = + make_shared(file_handle, context, buffer_size, global_csv_start + actual_buffer_size, file_number_p, + buffer_idx + 1, single_threaded); if (next_csv_buffer->GetBufferSize() == 0) { // We are done reading return nullptr; @@ -48,9 +49,8 @@ shared_ptr CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_s return next_csv_buffer; } -void CSVBuffer::AllocateBuffer(idx_t buffer_size) { +void CSVBuffer::AllocateBuffer(idx_t buffer_size, bool can_destroy) { auto &buffer_manager = BufferManager::GetBufferManager(context); - bool can_destroy = can_seek; handle = buffer_manager.Allocate(MemoryTag::CSV_READER, MaxValue(Storage::BLOCK_SIZE, buffer_size), can_destroy, &block); } @@ -60,7 +60,7 @@ idx_t CSVBuffer::GetBufferSize() { } void CSVBuffer::Reload(CSVFileHandle &file_handle) { - AllocateBuffer(actual_buffer_size); + AllocateBuffer(actual_buffer_size, false); file_handle.Seek(global_csv_start); file_handle.Read(handle.Ptr(), actual_buffer_size); } diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp index 2a13158b6081..568343cafad7 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp @@ -4,8 +4,9 @@ namespace duckdb { CSVBufferManager::CSVBufferManager(ClientContext &context_p, const CSVReaderOptions &options, const string &file_path_p, - const idx_t file_idx_p) - : context(context_p), file_idx(file_idx_p), file_path(file_path_p), buffer_size(CSVBuffer::CSV_BUFFER_SIZE) { + const idx_t file_idx_p, bool single_threaded_p) + : context(context_p), file_idx(file_idx_p), file_path(file_path_p), buffer_size(CSVBuffer::CSV_BUFFER_SIZE), + single_threaded(single_threaded_p) { D_ASSERT(!file_path.empty()); file_handle = ReadCSV::OpenCSV(file_path, options.compression, context); skip_rows = options.dialect_options.skip_rows.GetValue(); @@ -28,7 +29,7 @@ void CSVBufferManager::UnpinBuffer(const idx_t cache_idx) { void CSVBufferManager::Initialize() { if (cached_buffers.empty()) { cached_buffers.emplace_back( - make_shared(context, 
buffer_size, *file_handle, global_csv_pos, file_idx)); + make_shared(context, buffer_size, *file_handle, global_csv_pos, file_idx, single_threaded)); last_buffer = cached_buffers.front(); } } @@ -47,7 +48,8 @@ bool CSVBufferManager::ReadNextAndCacheIt() { last_buffer->last_buffer = true; return false; } - auto maybe_last_buffer = last_buffer->Next(*file_handle, cur_buffer_size, file_idx, has_seeked); + auto maybe_last_buffer = + last_buffer->Next(*file_handle, cur_buffer_size, file_idx, has_seeked, single_threaded); if (!maybe_last_buffer) { last_buffer->last_buffer = true; return false; @@ -126,4 +128,8 @@ string CSVBufferManager::GetFilePath() { return file_path; } +void CSVBufferManager::SetSingleThreaded() { + single_threaded = true; +} + } // namespace duckdb diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 9582e1c1af2f..a0376cc94947 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -530,9 +530,9 @@ unique_ptr StringValueScanner::GetCSVScanner(ClientContext & state_machine->dialect_options.num_cols = options.dialect_options.num_cols; state_machine->dialect_options.header = options.dialect_options.header; - auto buffer_manager = make_shared(context, options, options.file_path, 0); + auto buffer_manager = make_shared(context, options, options.file_path, 0, false); auto scanner = make_uniq(buffer_manager, state_machine, make_shared()); - scanner->csv_file_scan = make_shared(context, options.file_path, options); + scanner->csv_file_scan = make_shared(context, options.file_path, options, false); scanner->csv_file_scan->InitializeProjection(); return scanner; } diff --git a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp index 0532fc678a41..8013a2da10ef 100644 --- a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +++ b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp @@ -41,7 +41,7 @@ CSVFileScan::CSVFileScan(ClientContext &context, shared_ptr bu CSVFileScan::CSVFileScan(ClientContext &context, const string &file_path_p, const CSVReaderOptions &options_p, const idx_t file_idx_p, const ReadCSVData &bind_data, const vector &column_ids, - const vector &file_schema) + const vector &file_schema, bool single_threaded) : file_path(file_path_p), file_idx(file_idx_p), error_handler(make_shared(options_p.ignore_errors)), options(options_p) { if (file_idx < bind_data.union_readers.size()) { @@ -73,7 +73,7 @@ CSVFileScan::CSVFileScan(ClientContext &context, const string &file_path_p, cons } // Initialize Buffer Manager - buffer_manager = make_shared(context, options, file_path, file_idx); + buffer_manager = make_shared(context, options, file_path, file_idx, single_threaded); // Initialize On Disk and Size of file on_disk_file = buffer_manager->file_handle->OnDiskFile(); file_size = buffer_manager->file_handle->FileSize(); @@ -128,10 +128,11 @@ CSVFileScan::CSVFileScan(ClientContext &context, const string &file_path_p, cons InitializeFileNamesTypes(); } -CSVFileScan::CSVFileScan(ClientContext &context, const string &file_name, CSVReaderOptions &options_p) +CSVFileScan::CSVFileScan(ClientContext &context, const string &file_name, CSVReaderOptions &options_p, + bool single_threaded) : file_path(file_name), file_idx(0), 
error_handler(make_shared(options_p.ignore_errors)), options(options_p) { - buffer_manager = make_shared(context, options, file_path, file_idx); + buffer_manager = make_shared(context, options, file_path, file_idx, single_threaded); // Initialize On Disk and Size of file on_disk_file = buffer_manager->file_handle->OnDiskFile(); file_size = buffer_manager->file_handle->FileSize(); diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 72707df66324..f0ec98fbbdfb 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -22,7 +22,7 @@ CSVGlobalState::CSVGlobalState(ClientContext &context_p, const shared_ptr(context, files[0], options, 0, bind_data, column_ids, file_schema)); + make_uniq(context, files[0], options, 0, bind_data, column_ids, file_schema, single_threaded)); }; //! There are situations where we only support single threaded scanning bool many_csv_files = files.size() > 1 && files.size() > system_threads * 2; @@ -65,9 +65,12 @@ unique_ptr CSVGlobalState::Next() { shared_ptr current_file; if (cur_idx == 0) { current_file = file_scans.back(); + current_file->buffer_manager->SetSingleThreaded(); } else { + lock_guard parallel_lock(main_mutex); file_scans.emplace_back(make_shared(context, bind_data.files[cur_idx], bind_data.options, - cur_idx, bind_data, column_ids, file_schema)); + cur_idx, bind_data, column_ids, file_schema, + single_threaded)); current_file = file_scans.back(); } auto csv_scanner = diff --git a/src/function/table/copy_csv.cpp b/src/function/table/copy_csv.cpp index e2f9a2403c08..67e8041f4da6 100644 --- a/src/function/table/copy_csv.cpp +++ b/src/function/table/copy_csv.cpp @@ -156,7 +156,7 @@ static unique_ptr ReadCSVBind(ClientContext &context, CopyInfo &in } if (options.auto_detect) { - auto buffer_manager = make_shared(context, options, bind_data->files[0], 0); + auto buffer_manager = make_shared(context, options, bind_data->files[0], 0, false); CSVSniffer sniffer(options, buffer_manager, CSVStateMachineCache::Get(context), {&expected_types, &expected_names}); sniffer.SniffCSV(); diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 8d2e1be0d780..0963f35c0478 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -98,7 +98,7 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio } if (options.auto_detect && !options.file_options.union_by_name) { options.file_path = result->files[0]; - result->buffer_manager = make_shared(context, options, result->files[0], 0); + result->buffer_manager = make_shared(context, options, result->files[0], 0, false); CSVSniffer sniffer(options, result->buffer_manager, CSVStateMachineCache::Get(context), {&return_types, &names}); auto sniffer_result = sniffer.SniffCSV(); diff --git a/src/function/table/sniff_csv.cpp b/src/function/table/sniff_csv.cpp index f135b15c615d..b776288d6f27 100644 --- a/src/function/table/sniff_csv.cpp +++ b/src/function/table/sniff_csv.cpp @@ -120,7 +120,7 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p, auto sniffer_options = data.options; sniffer_options.file_path = data.path; - auto buffer_manager = make_shared(context, sniffer_options, sniffer_options.file_path, 0); + auto buffer_manager = make_shared(context, sniffer_options, sniffer_options.file_path, 0, false); CSVSniffer 
sniffer(sniffer_options, buffer_manager, CSVStateMachineCache::Get(context)); auto sniffer_result = sniffer.SniffCSV(true); string str_opt; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp index 72665ae2de54..8200da88e32a 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp @@ -44,14 +44,15 @@ class CSVBuffer { public: //! Constructor for Initial Buffer CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle, - idx_t &global_csv_current_position, idx_t file_number); + idx_t &global_csv_current_position, idx_t file_number, bool single_threaded); //! Constructor for `Next()` Buffers CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t buffer_size, idx_t global_csv_current_position, - idx_t file_number_p, idx_t buffer_idx); + idx_t file_number_p, idx_t buffer_idx, bool single_threaded); //! Creates a new buffer with the next part of the CSV File - shared_ptr Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number, bool &has_seaked); + shared_ptr Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number, bool &has_seaked, + bool single_threaded); //! Gets the buffer actual size idx_t GetBufferSize(); @@ -60,7 +61,7 @@ class CSVBuffer { bool IsCSVFileLastBuffer(); //! Allocates internal buffer, sets 'block' and 'handle' variables. - void AllocateBuffer(idx_t buffer_size); + void AllocateBuffer(idx_t buffer_size, bool can_destroy); void Reload(CSVFileHandle &file_handle); //! Wrapper for the Pin Function, if it can seek, it means that the buffer might have been destroyed, hence we must diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp index b9b4bb92d372..a1127882a718 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp @@ -22,7 +22,7 @@ class CSVStateMachine; class CSVBufferManager { public: CSVBufferManager(ClientContext &context, const CSVReaderOptions &options, const string &file_path, - const idx_t file_idx); + const idx_t file_idx, bool single_threaded); //! Returns a buffer from a buffer id (starting from 0). If it's in the auto-detection then we cache new buffers //! Otherwise we remove them from the cache if they are already there, or just return them bypassing the cache. shared_ptr GetBuffer(const idx_t buffer_idx); @@ -44,6 +44,8 @@ class CSVBufferManager { string GetFilePath(); + void SetSingleThreaded(); + ClientContext &context; idx_t skip_rows = 0; @@ -69,6 +71,7 @@ class CSVBufferManager { //! If the file_handle used seek bool has_seeked = false; unordered_set reset_when_possible; + bool single_threaded; }; } // namespace duckdb diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp index ce9fc08ce0bd..ed859238d9ef 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp @@ -27,9 +27,10 @@ class CSVFileScan { //! 
Path to this file CSVFileScan(ClientContext &context, const string &file_path, const CSVReaderOptions &options, const idx_t file_idx, const ReadCSVData &bind_data, const vector &column_ids, - const vector &file_schema); + const vector &file_schema, bool single_threaded); - CSVFileScan(ClientContext &context, const string &file_name, CSVReaderOptions &options); + CSVFileScan(ClientContext &context, const string &file_name, CSVReaderOptions &options, + bool single_threaded = false); const string &GetFileName(); const vector &GetNames(); diff --git a/src/main/relation/read_csv_relation.cpp b/src/main/relation/read_csv_relation.cpp index 1500720e0069..5d0b52e5c96d 100644 --- a/src/main/relation/read_csv_relation.cpp +++ b/src/main/relation/read_csv_relation.cpp @@ -56,7 +56,7 @@ ReadCSVRelation::ReadCSVRelation(const std::shared_ptr &context, shared_ptr buffer_manager; context->RunFunctionInTransaction([&]() { - buffer_manager = make_shared(*context, csv_options, files[0], 0); + buffer_manager = make_shared(*context, csv_options, files[0], 0, false); CSVSniffer sniffer(csv_options, buffer_manager, CSVStateMachineCache::Get(*context)); auto sniffer_result = sniffer.SniffCSV(); auto &types = sniffer_result.return_types; From 8db82b0d018f0694e13cfd4ed5a2b5d6ce3edde9 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 20 Mar 2024 15:44:24 +0100 Subject: [PATCH 051/147] bad file --- test/sql/copy/csv/test_gzipped.test | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test/sql/copy/csv/test_gzipped.test diff --git a/test/sql/copy/csv/test_gzipped.test b/test/sql/copy/csv/test_gzipped.test deleted file mode 100644 index e69de29bb2d1..000000000000 From d996c085f2f5da8b296137d9e74d8cd2233f9d5b Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 25 Mar 2024 12:34:39 +0100 Subject: [PATCH 052/147] Restore old bm --- .../csv_scanner/buffer_manager/csv_buffer.cpp | 27 ++++++++++--------- .../buffer_manager/csv_buffer_manager.cpp | 14 +++------- .../buffer_manager/csv_file_handle.cpp | 12 ++++++++- .../scanner/string_value_scanner.cpp | 2 +- .../table_function/csv_file_scanner.cpp | 4 +-- .../table_function/global_csv_state.cpp | 1 - src/function/table/copy_csv.cpp | 2 +- src/function/table/read_csv.cpp | 2 +- src/function/table/sniff_csv.cpp | 2 +- .../operator/csv_scanner/csv_buffer.hpp | 12 ++++----- .../csv_scanner/csv_buffer_manager.hpp | 5 +--- .../operator/csv_scanner/csv_file_handle.hpp | 3 +++ src/main/relation/read_csv_relation.cpp | 2 +- 13 files changed, 46 insertions(+), 42 deletions(-) diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp index 6ac66783f041..aaafd21331c6 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp @@ -4,9 +4,9 @@ namespace duckdb { CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle, - idx_t &global_csv_current_position, idx_t file_number_p, bool single_threaded) - : context(context), file_number(file_number_p), can_seek(file_handle.CanSeek()) { - AllocateBuffer(buffer_size_p, can_seek || single_threaded); + idx_t &global_csv_current_position, idx_t file_number_p) + : context(context), file_number(file_number_p), can_seek(file_handle.CanSeek()), is_pipe(file_handle.OnDiskFile()) { + AllocateBuffer(buffer_size_p); auto buffer = Ptr(); actual_buffer_size = file_handle.Read(buffer, buffer_size_p); while 
(actual_buffer_size < buffer_size_p && !file_handle.FinishedReading()) { @@ -18,10 +18,10 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle } CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t buffer_size, - idx_t global_csv_current_position, idx_t file_number_p, idx_t buffer_idx_p, bool single_threaded) + idx_t global_csv_current_position, idx_t file_number_p, idx_t buffer_idx_p) : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p), - can_seek(file_handle.CanSeek()), buffer_idx(buffer_idx_p) { - AllocateBuffer(buffer_size, single_threaded || can_seek); + can_seek(file_handle.CanSeek()), is_pipe(file_handle.OnDiskFile()), buffer_idx(buffer_idx_p) { + AllocateBuffer(buffer_size); auto buffer = handle.Ptr(); actual_buffer_size = file_handle.Read(handle.Ptr(), buffer_size); while (actual_buffer_size < buffer_size && !file_handle.FinishedReading()) { @@ -32,16 +32,15 @@ CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t b } shared_ptr CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number_p, - bool &has_seaked, bool single_threaded) { + bool &has_seaked) { if (has_seaked) { // This means that at some point a reload was done, and we are currently on the incorrect position in our file // handle file_handle.Seek(global_csv_start + actual_buffer_size); has_seaked = false; } - auto next_csv_buffer = - make_shared(file_handle, context, buffer_size, global_csv_start + actual_buffer_size, file_number_p, - buffer_idx + 1, single_threaded); + auto next_csv_buffer = make_shared(file_handle, context, buffer_size, + global_csv_start + actual_buffer_size, file_number_p, buffer_idx + 1); if (next_csv_buffer->GetBufferSize() == 0) { // We are done reading return nullptr; @@ -49,8 +48,9 @@ shared_ptr CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_s return next_csv_buffer; } -void CSVBuffer::AllocateBuffer(idx_t buffer_size, bool can_destroy) { +void CSVBuffer::AllocateBuffer(idx_t buffer_size) { auto &buffer_manager = BufferManager::GetBufferManager(context); + bool can_destroy = !is_pipe; handle = buffer_manager.Allocate(MemoryTag::CSV_READER, MaxValue(Storage::BLOCK_SIZE, buffer_size), can_destroy, &block); } @@ -60,14 +60,15 @@ idx_t CSVBuffer::GetBufferSize() { } void CSVBuffer::Reload(CSVFileHandle &file_handle) { - AllocateBuffer(actual_buffer_size, false); + AllocateBuffer(actual_buffer_size); + // If we can seek, we seek and return the correct pointers file_handle.Seek(global_csv_start); file_handle.Read(handle.Ptr(), actual_buffer_size); } shared_ptr CSVBuffer::Pin(CSVFileHandle &file_handle, bool &has_seeked) { auto &buffer_manager = BufferManager::GetBufferManager(context); - if (can_seek && block->IsUnloaded()) { + if (is_pipe && block->IsUnloaded()) { // We have to reload it from disk block = nullptr; Reload(file_handle); diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp index 568343cafad7..2a13158b6081 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp @@ -4,9 +4,8 @@ namespace duckdb { CSVBufferManager::CSVBufferManager(ClientContext &context_p, const CSVReaderOptions &options, const string &file_path_p, - const idx_t file_idx_p, bool single_threaded_p) - : context(context_p), file_idx(file_idx_p), 
file_path(file_path_p), buffer_size(CSVBuffer::CSV_BUFFER_SIZE), - single_threaded(single_threaded_p) { + const idx_t file_idx_p) + : context(context_p), file_idx(file_idx_p), file_path(file_path_p), buffer_size(CSVBuffer::CSV_BUFFER_SIZE) { D_ASSERT(!file_path.empty()); file_handle = ReadCSV::OpenCSV(file_path, options.compression, context); skip_rows = options.dialect_options.skip_rows.GetValue(); @@ -29,7 +28,7 @@ void CSVBufferManager::UnpinBuffer(const idx_t cache_idx) { void CSVBufferManager::Initialize() { if (cached_buffers.empty()) { cached_buffers.emplace_back( - make_shared(context, buffer_size, *file_handle, global_csv_pos, file_idx, single_threaded)); + make_shared(context, buffer_size, *file_handle, global_csv_pos, file_idx)); last_buffer = cached_buffers.front(); } } @@ -48,8 +47,7 @@ bool CSVBufferManager::ReadNextAndCacheIt() { last_buffer->last_buffer = true; return false; } - auto maybe_last_buffer = - last_buffer->Next(*file_handle, cur_buffer_size, file_idx, has_seeked, single_threaded); + auto maybe_last_buffer = last_buffer->Next(*file_handle, cur_buffer_size, file_idx, has_seeked); if (!maybe_last_buffer) { last_buffer->last_buffer = true; return false; @@ -128,8 +126,4 @@ string CSVBufferManager::GetFilePath() { return file_path; } -void CSVBufferManager::SetSingleThreaded() { - single_threaded = true; -} - } // namespace duckdb diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp index cbb1c1cd86e7..cf4bf9fafdf2 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp @@ -9,6 +9,7 @@ CSVFileHandle::CSVFileHandle(FileSystem &fs, Allocator &allocator, unique_ptrCanSeek(); on_disk_file = file_handle->OnDiskFile(); file_size = file_handle->GetFileSize(); + is_pipe = file_handle->IsPipe(); uncompressed = compression == FileCompressionType::UNCOMPRESSED; } @@ -33,7 +34,12 @@ bool CSVFileHandle::CanSeek() { void CSVFileHandle::Seek(idx_t position) { if (!can_seek) { - throw InternalException("Cannot seek in this file"); + if (is_pipe) { + throw InternalException("Can't reconstruct the buffer from a on disk file."); + } + //! If we can't seek in this file, we reset it and re-read up to the necessary point. 
+ file_handle->Reset(); + // file_handle->Read(); } file_handle->Seek(position); } @@ -42,6 +48,10 @@ bool CSVFileHandle::OnDiskFile() { return on_disk_file; } +bool CSVFileHandle::IsPipe() { + return is_pipe; +} + idx_t CSVFileHandle::FileSize() { return file_size; } diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index a0376cc94947..b1b2afa39c98 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -530,7 +530,7 @@ unique_ptr StringValueScanner::GetCSVScanner(ClientContext & state_machine->dialect_options.num_cols = options.dialect_options.num_cols; state_machine->dialect_options.header = options.dialect_options.header; - auto buffer_manager = make_shared(context, options, options.file_path, 0, false); + auto buffer_manager = make_shared(context, options, options.file_path, 0); auto scanner = make_uniq(buffer_manager, state_machine, make_shared()); scanner->csv_file_scan = make_shared(context, options.file_path, options, false); scanner->csv_file_scan->InitializeProjection(); diff --git a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp index 8013a2da10ef..7d975221c3ac 100644 --- a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +++ b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp @@ -73,7 +73,7 @@ CSVFileScan::CSVFileScan(ClientContext &context, const string &file_path_p, cons } // Initialize Buffer Manager - buffer_manager = make_shared(context, options, file_path, file_idx, single_threaded); + buffer_manager = make_shared(context, options, file_path, file_idx); // Initialize On Disk and Size of file on_disk_file = buffer_manager->file_handle->OnDiskFile(); file_size = buffer_manager->file_handle->FileSize(); @@ -132,7 +132,7 @@ CSVFileScan::CSVFileScan(ClientContext &context, const string &file_name, CSVRea bool single_threaded) : file_path(file_name), file_idx(0), error_handler(make_shared(options_p.ignore_errors)), options(options_p) { - buffer_manager = make_shared(context, options, file_path, file_idx, single_threaded); + buffer_manager = make_shared(context, options, file_path, file_idx); // Initialize On Disk and Size of file on_disk_file = buffer_manager->file_handle->OnDiskFile(); file_size = buffer_manager->file_handle->FileSize(); diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index f0ec98fbbdfb..524482e0de55 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -65,7 +65,6 @@ unique_ptr CSVGlobalState::Next() { shared_ptr current_file; if (cur_idx == 0) { current_file = file_scans.back(); - current_file->buffer_manager->SetSingleThreaded(); } else { lock_guard parallel_lock(main_mutex); file_scans.emplace_back(make_shared(context, bind_data.files[cur_idx], bind_data.options, diff --git a/src/function/table/copy_csv.cpp b/src/function/table/copy_csv.cpp index 67e8041f4da6..e2f9a2403c08 100644 --- a/src/function/table/copy_csv.cpp +++ b/src/function/table/copy_csv.cpp @@ -156,7 +156,7 @@ static unique_ptr ReadCSVBind(ClientContext &context, CopyInfo &in } if (options.auto_detect) { - auto buffer_manager = 
make_shared(context, options, bind_data->files[0], 0, false); + auto buffer_manager = make_shared(context, options, bind_data->files[0], 0); CSVSniffer sniffer(options, buffer_manager, CSVStateMachineCache::Get(context), {&expected_types, &expected_names}); sniffer.SniffCSV(); diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 0963f35c0478..8d2e1be0d780 100644 --- a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -98,7 +98,7 @@ static unique_ptr ReadCSVBind(ClientContext &context, TableFunctio } if (options.auto_detect && !options.file_options.union_by_name) { options.file_path = result->files[0]; - result->buffer_manager = make_shared(context, options, result->files[0], 0, false); + result->buffer_manager = make_shared(context, options, result->files[0], 0); CSVSniffer sniffer(options, result->buffer_manager, CSVStateMachineCache::Get(context), {&return_types, &names}); auto sniffer_result = sniffer.SniffCSV(); diff --git a/src/function/table/sniff_csv.cpp b/src/function/table/sniff_csv.cpp index d27817f818eb..3e859a65afe3 100644 --- a/src/function/table/sniff_csv.cpp +++ b/src/function/table/sniff_csv.cpp @@ -120,7 +120,7 @@ static void CSVSniffFunction(ClientContext &context, TableFunctionInput &data_p, auto sniffer_options = data.options; sniffer_options.file_path = data.path; - auto buffer_manager = make_shared(context, sniffer_options, sniffer_options.file_path, 0, false); + auto buffer_manager = make_shared(context, sniffer_options, sniffer_options.file_path, 0); if (sniffer_options.name_list.empty()) { sniffer_options.name_list = data.names_csv; } diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp index 8200da88e32a..a5a90d763e06 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer.hpp @@ -44,15 +44,14 @@ class CSVBuffer { public: //! Constructor for Initial Buffer CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle, - idx_t &global_csv_current_position, idx_t file_number, bool single_threaded); + idx_t &global_csv_current_position, idx_t file_number); //! Constructor for `Next()` Buffers CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t buffer_size, idx_t global_csv_current_position, - idx_t file_number_p, idx_t buffer_idx, bool single_threaded); + idx_t file_number_p, idx_t buffer_idx); //! Creates a new buffer with the next part of the CSV File - shared_ptr Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number, bool &has_seaked, - bool single_threaded); + shared_ptr Next(CSVFileHandle &file_handle, idx_t buffer_size, idx_t file_number, bool &has_seaked); //! Gets the buffer actual size idx_t GetBufferSize(); @@ -61,7 +60,7 @@ class CSVBuffer { bool IsCSVFileLastBuffer(); //! Allocates internal buffer, sets 'block' and 'handle' variables. - void AllocateBuffer(idx_t buffer_size, bool can_destroy); + void AllocateBuffer(idx_t buffer_size); void Reload(CSVFileHandle &file_handle); //! Wrapper for the Pin Function, if it can seek, it means that the buffer might have been destroyed, hence we must @@ -92,8 +91,9 @@ class CSVBuffer { //! Number of the file that is in this buffer idx_t file_number = 0; //! If we can seek in the file or not. - //! If we can't seek, this means we can't destroy the buffers bool can_seek; + //! If this file is being fed by a pipe. + bool is_pipe; //! 
Buffer Index, used as a batch index for insertion-order preservation idx_t buffer_idx = 0; //! -------- Allocated Block ---------// diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp index a1127882a718..b9b4bb92d372 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp @@ -22,7 +22,7 @@ class CSVStateMachine; class CSVBufferManager { public: CSVBufferManager(ClientContext &context, const CSVReaderOptions &options, const string &file_path, - const idx_t file_idx, bool single_threaded); + const idx_t file_idx); //! Returns a buffer from a buffer id (starting from 0). If it's in the auto-detection then we cache new buffers //! Otherwise we remove them from the cache if they are already there, or just return them bypassing the cache. shared_ptr GetBuffer(const idx_t buffer_idx); @@ -44,8 +44,6 @@ class CSVBufferManager { string GetFilePath(); - void SetSingleThreaded(); - ClientContext &context; idx_t skip_rows = 0; @@ -71,7 +69,6 @@ class CSVBufferManager { //! If the file_handle used seek bool has_seeked = false; unordered_set reset_when_possible; - bool single_threaded; }; } // namespace duckdb diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp index c7e70b008ae7..95a3cc6dafc9 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp @@ -28,6 +28,7 @@ struct CSVFileHandle { bool CanSeek(); void Seek(idx_t position); bool OnDiskFile(); + bool IsPipe(); idx_t FileSize(); @@ -50,6 +51,8 @@ struct CSVFileHandle { string path; bool can_seek = false; bool on_disk_file = false; + bool is_pipe = false; + idx_t file_size = 0; idx_t requested_bytes = 0; diff --git a/src/main/relation/read_csv_relation.cpp b/src/main/relation/read_csv_relation.cpp index 5d0b52e5c96d..1500720e0069 100644 --- a/src/main/relation/read_csv_relation.cpp +++ b/src/main/relation/read_csv_relation.cpp @@ -56,7 +56,7 @@ ReadCSVRelation::ReadCSVRelation(const std::shared_ptr &context, shared_ptr buffer_manager; context->RunFunctionInTransaction([&]() { - buffer_manager = make_shared(*context, csv_options, files[0], 0, false); + buffer_manager = make_shared(*context, csv_options, files[0], 0); CSVSniffer sniffer(csv_options, buffer_manager, CSVStateMachineCache::Get(*context)); auto sniffer_result = sniffer.SniffCSV(); auto &types = sniffer_result.return_types; From 03cb16a6ca02a84bebea5448f904ff75ffe39312 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 25 Mar 2024 16:41:31 +0100 Subject: [PATCH 053/147] Not store buffers from gzipped files, reset buffer manager after sniffing, implement brute force seeking for gzipped files --- .../csv_scanner/buffer_manager/csv_buffer.cpp | 4 ++-- .../buffer_manager/csv_buffer_manager.cpp | 13 +++++++++++++ .../buffer_manager/csv_file_handle.cpp | 17 ++++++++++++++--- .../csv_scanner/sniffer/csv_sniffer.cpp | 2 ++ .../operator/csv_scanner/csv_buffer_manager.hpp | 1 + .../operator/csv_scanner/csv_file_handle.hpp | 4 +++- 6 files changed, 35 insertions(+), 6 deletions(-) diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp index aaafd21331c6..2c85e0ee2924 100644 --- 
a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp @@ -36,7 +36,7 @@ shared_ptr CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_s if (has_seaked) { // This means that at some point a reload was done, and we are currently on the incorrect position in our file // handle - file_handle.Seek(global_csv_start + actual_buffer_size); + file_handle.Seek(handle.Ptr(), actual_buffer_size, global_csv_start + actual_buffer_size); has_seaked = false; } auto next_csv_buffer = make_shared(file_handle, context, buffer_size, @@ -62,7 +62,7 @@ idx_t CSVBuffer::GetBufferSize() { void CSVBuffer::Reload(CSVFileHandle &file_handle) { AllocateBuffer(actual_buffer_size); // If we can seek, we seek and return the correct pointers - file_handle.Seek(global_csv_start); + file_handle.Seek(handle.Ptr(), actual_buffer_size, global_csv_start); file_handle.Read(handle.Ptr(), actual_buffer_size); } diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp index 2a13158b6081..66a6e6ab3cec 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer_manager.cpp @@ -122,6 +122,19 @@ bool CSVBufferManager::Done() { return done; } +void CSVBufferManager::ResetBufferManager() { + if (!file_handle->IsPipe()) { + // If this is not a pipe we reset the buffer manager and restart it when doing the actual scan + cached_buffers.clear(); + reset_when_possible.clear(); + file_handle->Reset(); + last_buffer = nullptr; + done = false; + global_csv_pos = 0; + Initialize(); + } +} + string CSVBufferManager::GetFilePath() { return file_path; } diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp index cf4bf9fafdf2..528246d8119d 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp @@ -32,14 +32,19 @@ bool CSVFileHandle::CanSeek() { return can_seek; } -void CSVFileHandle::Seek(idx_t position) { +void CSVFileHandle::Seek(void *buffer, idx_t nr_bytes, idx_t position) { if (!can_seek) { if (is_pipe) { throw InternalException("Can't reconstruct the buffer from a on disk file."); } - //! If we can't seek in this file, we reset it and re-read up to the necessary point. + // If we can't seek in this file, we reset it and re-read up to the necessary point. + // This should only happen on extreme cases of memory pressure file_handle->Reset(); - // file_handle->Read(); + D_ASSERT(position % nr_bytes == 0); + for (idx_t i = 0; i < position / nr_bytes; i++) { + file_handle->Read(buffer, nr_bytes); + } + return; } file_handle->Seek(position); } @@ -48,6 +53,12 @@ bool CSVFileHandle::OnDiskFile() { return on_disk_file; } +void CSVFileHandle::Reset() { + file_handle->Reset(); + finished = false; + requested_bytes = 0; +} + bool CSVFileHandle::IsPipe() { return is_pipe; } diff --git a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index 3b60f247aa27..a62aed3ca3d7 100644 --- a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -93,6 +93,8 @@ SnifferResult CSVSniffer::SniffCSV(bool force_match) { DetectHeader(); // 5. 
Type Replacement ReplaceTypes(); + buffer_manager->ResetBufferManager(); + if (!best_candidate->error_handler->errors.empty() && !options.ignore_errors) { for (auto &error_vector : best_candidate->error_handler->errors) { for (auto &error : error_vector.second) { diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp index b9b4bb92d372..f8c6f246c3e6 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_buffer_manager.hpp @@ -42,6 +42,7 @@ class CSVBufferManager { //! once. bool Done(); + void ResetBufferManager(); string GetFilePath(); ClientContext &context; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp index 95a3cc6dafc9..7d4b55e424fd 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp @@ -26,10 +26,12 @@ struct CSVFileHandle { public: bool CanSeek(); - void Seek(idx_t position); + void Seek(void *buffer, idx_t nr_bytes, idx_t position); bool OnDiskFile(); bool IsPipe(); + void Reset(); + idx_t FileSize(); bool FinishedReading(); From 4eba1ab60d5259db6a8badf8d8cedcdbdff72884 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 26 Mar 2024 13:55:10 +0100 Subject: [PATCH 054/147] fix old parameter --- .../operator/csv_scanner/scanner/string_value_scanner.cpp | 2 +- .../csv_scanner/table_function/csv_file_scanner.cpp | 5 ++--- .../csv_scanner/table_function/global_csv_state.cpp | 7 +++---- .../execution/operator/csv_scanner/csv_file_scanner.hpp | 5 ++--- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index b1b2afa39c98..9582e1c1af2f 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -532,7 +532,7 @@ unique_ptr StringValueScanner::GetCSVScanner(ClientContext & state_machine->dialect_options.header = options.dialect_options.header; auto buffer_manager = make_shared(context, options, options.file_path, 0); auto scanner = make_uniq(buffer_manager, state_machine, make_shared()); - scanner->csv_file_scan = make_shared(context, options.file_path, options, false); + scanner->csv_file_scan = make_shared(context, options.file_path, options); scanner->csv_file_scan->InitializeProjection(); return scanner; } diff --git a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp index 7d975221c3ac..0532fc678a41 100644 --- a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +++ b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp @@ -41,7 +41,7 @@ CSVFileScan::CSVFileScan(ClientContext &context, shared_ptr bu CSVFileScan::CSVFileScan(ClientContext &context, const string &file_path_p, const CSVReaderOptions &options_p, const idx_t file_idx_p, const ReadCSVData &bind_data, const vector &column_ids, - const vector &file_schema, bool single_threaded) + const vector &file_schema) : file_path(file_path_p), file_idx(file_idx_p), error_handler(make_shared(options_p.ignore_errors)), options(options_p) { if (file_idx < 
bind_data.union_readers.size()) { @@ -128,8 +128,7 @@ CSVFileScan::CSVFileScan(ClientContext &context, const string &file_path_p, cons InitializeFileNamesTypes(); } -CSVFileScan::CSVFileScan(ClientContext &context, const string &file_name, CSVReaderOptions &options_p, - bool single_threaded) +CSVFileScan::CSVFileScan(ClientContext &context, const string &file_name, CSVReaderOptions &options_p) : file_path(file_name), file_idx(0), error_handler(make_shared(options_p.ignore_errors)), options(options_p) { buffer_manager = make_shared(context, options, file_path, file_idx); diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 524482e0de55..e92ed51273ef 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -22,7 +22,7 @@ CSVGlobalState::CSVGlobalState(ClientContext &context_p, const shared_ptr(context, files[0], options, 0, bind_data, column_ids, file_schema, single_threaded)); + make_uniq(context, files[0], options, 0, bind_data, column_ids, file_schema)); }; //! There are situations where we only support single threaded scanning bool many_csv_files = files.size() > 1 && files.size() > system_threads * 2; @@ -68,8 +68,7 @@ unique_ptr CSVGlobalState::Next() { } else { lock_guard parallel_lock(main_mutex); file_scans.emplace_back(make_shared(context, bind_data.files[cur_idx], bind_data.options, - cur_idx, bind_data, column_ids, file_schema, - single_threaded)); + cur_idx, bind_data, column_ids, file_schema)); current_file = file_scans.back(); } auto csv_scanner = @@ -101,7 +100,7 @@ unique_ptr CSVGlobalState::Next() { // If we have a next file we have to construct the file scan for that file_scans.emplace_back(make_shared(context, bind_data.files[current_file_idx], bind_data.options, current_file_idx, bind_data, column_ids, - file_schema, single_threaded)); + file_schema)); // And re-start the boundary-iterator auto buffer_size = file_scans.back()->buffer_manager->GetBuffer(0)->actual_size; current_boundary = CSVIterator(current_file_idx, 0, 0, 0, buffer_size); diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp index ed859238d9ef..ce9fc08ce0bd 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp @@ -27,10 +27,9 @@ class CSVFileScan { //! 
Path to this file CSVFileScan(ClientContext &context, const string &file_path, const CSVReaderOptions &options, const idx_t file_idx, const ReadCSVData &bind_data, const vector &column_ids, - const vector &file_schema, bool single_threaded); + const vector &file_schema); - CSVFileScan(ClientContext &context, const string &file_name, CSVReaderOptions &options, - bool single_threaded = false); + CSVFileScan(ClientContext &context, const string &file_name, CSVReaderOptions &options); const string &GetFileName(); const vector &GetNames(); From ce9507077bbc837779c4ea60b711a271dbd113d5 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 26 Mar 2024 15:34:47 +0100 Subject: [PATCH 055/147] Cleanup buffer managers --- .../table_function/csv_file_scanner.cpp | 5 +++++ .../table_function/global_csv_state.cpp | 17 +++++++++++++++-- src/function/table/read_csv.cpp | 4 ++-- .../operator/csv_scanner/csv_file_scanner.hpp | 1 + .../operator/csv_scanner/global_csv_state.hpp | 4 +++- 5 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp index 0532fc678a41..f2ae71dbb8f8 100644 --- a/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp +++ b/src/execution/operator/csv_scanner/table_function/csv_file_scanner.cpp @@ -222,4 +222,9 @@ void CSVFileScan::InitializeProjection() { reader_data.column_mapping.push_back(i); } } + +void CSVFileScan::Finish() { + buffer_manager.reset(); +} + } // namespace duckdb diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 00a9052e7e48..9e8afd52a404 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -56,7 +56,7 @@ double CSVGlobalState::GetProgress(const ReadCSVData &bind_data_p) const { return percentage * 100; } -unique_ptr CSVGlobalState::Next() { +unique_ptr CSVGlobalState::Next(StringValueScanner *previous_scanner) { if (single_threaded) { idx_t cur_idx = last_file_idx++; if (cur_idx >= bind_data.files.size()) { @@ -71,6 +71,12 @@ unique_ptr CSVGlobalState::Next() { cur_idx, bind_data, column_ids, file_schema)); current_file = file_scans.back(); } + if (previous_scanner) { + lock_guard parallel_lock(main_mutex); + previous_scanner->buffer_tracker.reset(); + current_buffer_in_use.reset(); + previous_scanner->csv_file_scan->Finish(); + } auto csv_scanner = make_uniq(scanner_idx++, current_file->buffer_manager, current_file->state_machine, current_file->error_handler, current_file, false, current_boundary); @@ -89,7 +95,14 @@ unique_ptr CSVGlobalState::Next() { auto csv_scanner = make_uniq(scanner_idx++, current_file.buffer_manager, current_file.state_machine, current_file.error_handler, file_scans.back(), false, current_boundary); - + threads_per_file[csv_scanner->csv_file_scan->file_idx]++; + if (previous_scanner) { + threads_per_file[previous_scanner->csv_file_scan->file_idx]--; + if (threads_per_file[previous_scanner->csv_file_scan->file_idx] == 0) { + previous_scanner->buffer_tracker.reset(); + previous_scanner->csv_file_scan->Finish(); + } + } csv_scanner->buffer_tracker = current_buffer_in_use; // We then produce the next boundary diff --git a/src/function/table/read_csv.cpp b/src/function/table/read_csv.cpp index 8d2e1be0d780..71e6f2255a7c 100644 --- 
a/src/function/table/read_csv.cpp +++ b/src/function/table/read_csv.cpp @@ -185,7 +185,7 @@ unique_ptr ReadCSVInitLocal(ExecutionContext &context, return nullptr; } auto &global_state = global_state_p->Cast(); - auto csv_scanner = global_state.Next(); + auto csv_scanner = global_state.Next(nullptr); if (!csv_scanner) { global_state.DecrementThread(); } @@ -211,7 +211,7 @@ static void ReadCSVFunction(ClientContext &context, TableFunctionInput &data_p, break; } if (csv_local_state.csv_reader->FinishedIterator()) { - csv_local_state.csv_reader = csv_global_state.Next(); + csv_local_state.csv_reader = csv_global_state.Next(csv_local_state.csv_reader.get()); if (!csv_local_state.csv_reader) { csv_global_state.DecrementThread(); break; diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp index ce9fc08ce0bd..0ba7c0e02dcd 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_file_scanner.hpp @@ -35,6 +35,7 @@ class CSVFileScan { const vector &GetNames(); const vector &GetTypes(); void InitializeProjection(); + void Finish(); //! Initialize the actual names and types to be scanned from the file void InitializeFileNamesTypes(); diff --git a/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp b/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp index bbeb2bfee094..4d123480f45c 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp @@ -30,7 +30,7 @@ struct CSVGlobalState : public GlobalTableFunctionState { //! Generates a CSV Scanner, with information regarding the piece of buffer it should be read. //! In case it returns a nullptr it means we are done reading these files. 
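[Editor's note, not part of the patch series — the header hunk continues right after this aside. The buffer cleanup that PATCH 055 threads through Next(), which the hunk just below changes to receive the previous scanner, boils down to reference counting scanners per file: the last scanner to finish a file drops that file's buffers. A minimal sketch of that idea under stated assumptions — ScanScheduler and FileBuffers are made-up names, not DuckDB API:

#include <cstddef>
#include <iostream>
#include <unordered_map>

// Hypothetical stand-in for the per-file buffers that should be dropped
// once the last scanner of that file hands in its results.
struct FileBuffers {
	void Release(std::size_t file_idx) {
		std::cout << "releasing buffers of file " << file_idx << "\n";
	}
};

class ScanScheduler {
public:
	// A scanner was handed a piece of file `file_idx`.
	void ScannerStarted(std::size_t file_idx) {
		++threads_per_file[file_idx];
	}
	// A scanner finished; the last one out of a file releases its buffers,
	// mirroring the threads_per_file bookkeeping the patch adds to Next().
	void ScannerFinished(std::size_t file_idx, FileBuffers &buffers) {
		if (--threads_per_file[file_idx] == 0) {
			buffers.Release(file_idx);
		}
	}

private:
	std::unordered_map<std::size_t, std::size_t> threads_per_file;
};

int main() {
	ScanScheduler scheduler;
	FileBuffers buffers;
	scheduler.ScannerStarted(0);
	scheduler.ScannerStarted(0);
	scheduler.ScannerFinished(0, buffers); // one scanner still reading file 0
	scheduler.ScannerFinished(0, buffers); // last one out: buffers released
	return 0;
}

In the patch itself the counter lives in CSVGlobalState::threads_per_file, and "release" means resetting buffer_tracker and calling CSVFileScan::Finish(). End of editorial aside.]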
- unique_ptr Next(); + unique_ptr Next(StringValueScanner *previous_scanner); void FillRejectsTable(); @@ -75,6 +75,8 @@ struct CSVGlobalState : public GlobalTableFunctionState { atomic last_file_idx; shared_ptr current_buffer_in_use; + + unordered_map threads_per_file; }; } // namespace duckdb From a2ee8fc86e956b5cf9592c24f5be4e75b4b0f038 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 26 Mar 2024 15:58:54 +0100 Subject: [PATCH 056/147] IsPipe should check pipe --- .../operator/csv_scanner/buffer_manager/csv_buffer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp index 2c85e0ee2924..0886a58f1a35 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp @@ -5,7 +5,7 @@ namespace duckdb { CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle &file_handle, idx_t &global_csv_current_position, idx_t file_number_p) - : context(context), file_number(file_number_p), can_seek(file_handle.CanSeek()), is_pipe(file_handle.OnDiskFile()) { + : context(context), file_number(file_number_p), can_seek(file_handle.CanSeek()), is_pipe(file_handle.IsPipe()) { AllocateBuffer(buffer_size_p); auto buffer = Ptr(); actual_buffer_size = file_handle.Read(buffer, buffer_size_p); @@ -20,7 +20,7 @@ CSVBuffer::CSVBuffer(ClientContext &context, idx_t buffer_size_p, CSVFileHandle CSVBuffer::CSVBuffer(CSVFileHandle &file_handle, ClientContext &context, idx_t buffer_size, idx_t global_csv_current_position, idx_t file_number_p, idx_t buffer_idx_p) : context(context), global_csv_start(global_csv_current_position), file_number(file_number_p), - can_seek(file_handle.CanSeek()), is_pipe(file_handle.OnDiskFile()), buffer_idx(buffer_idx_p) { + can_seek(file_handle.CanSeek()), is_pipe(file_handle.IsPipe()), buffer_idx(buffer_idx_p) { AllocateBuffer(buffer_size); auto buffer = handle.Ptr(); actual_buffer_size = file_handle.Read(handle.Ptr(), buffer_size); From 3fdf469f191372f6932f41cb070532154ae32162 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 26 Mar 2024 18:12:42 +0100 Subject: [PATCH 057/147] fix small pipe bug --- .../operator/csv_scanner/buffer_manager/csv_buffer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp index 0886a58f1a35..7d2913e22e81 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp @@ -68,7 +68,7 @@ void CSVBuffer::Reload(CSVFileHandle &file_handle) { shared_ptr CSVBuffer::Pin(CSVFileHandle &file_handle, bool &has_seeked) { auto &buffer_manager = BufferManager::GetBufferManager(context); - if (is_pipe && block->IsUnloaded()) { + if (!is_pipe && block->IsUnloaded()) { // We have to reload it from disk block = nullptr; Reload(file_handle); From 111e9eefef6d7f253258a6fd1dca867975c5b8ef Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 2 Apr 2024 12:28:26 +0200 Subject: [PATCH 058/147] PR requests and adding/running big tests --- .../csv_scanner/buffer_manager/csv_buffer.cpp | 4 +- .../buffer_manager/csv_file_handle.cpp | 13 ++--- .../csv_scanner/sniffer/csv_sniffer.cpp | 8 ++- .../table_function/global_csv_state.cpp | 4 +- .../operator/csv_scanner/csv_file_handle.hpp | 2 +- 
.../operator/csv_scanner/global_csv_state.hpp | 2 +- .../copy/csv/test_big_compressed.test_slow | 32 +++++++++++ ...est_multiple_big_compressed_csvs.test_slow | 53 +++++++++++++++++++ 8 files changed, 101 insertions(+), 17 deletions(-) create mode 100644 test/sql/copy/csv/test_big_compressed.test_slow create mode 100644 test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp index 7d2913e22e81..79a1e8fd762b 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_buffer.cpp @@ -36,7 +36,7 @@ shared_ptr CSVBuffer::Next(CSVFileHandle &file_handle, idx_t buffer_s if (has_seaked) { // This means that at some point a reload was done, and we are currently on the incorrect position in our file // handle - file_handle.Seek(handle.Ptr(), actual_buffer_size, global_csv_start + actual_buffer_size); + file_handle.Seek(global_csv_start + actual_buffer_size); has_seaked = false; } auto next_csv_buffer = make_shared(file_handle, context, buffer_size, @@ -62,7 +62,7 @@ idx_t CSVBuffer::GetBufferSize() { void CSVBuffer::Reload(CSVFileHandle &file_handle) { AllocateBuffer(actual_buffer_size); // If we can seek, we seek and return the correct pointers - file_handle.Seek(handle.Ptr(), actual_buffer_size, global_csv_start); + file_handle.Seek(global_csv_start); file_handle.Read(handle.Ptr(), actual_buffer_size); } diff --git a/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp b/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp index 9db0131db294..d37e38be14ec 100644 --- a/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp +++ b/src/execution/operator/csv_scanner/buffer_manager/csv_file_handle.cpp @@ -32,19 +32,12 @@ bool CSVFileHandle::CanSeek() { return can_seek; } -void CSVFileHandle::Seek(void *buffer, idx_t nr_bytes, idx_t position) { +void CSVFileHandle::Seek(idx_t position) { if (!can_seek) { if (is_pipe) { - throw InternalException("Can't reconstruct the buffer from a on disk file."); + throw InternalException("Trying to seek a piped CSV File."); } - // If we can't seek in this file, we reset it and re-read up to the necessary point. - // This should only happen on extreme cases of memory pressure - file_handle->Reset(); - D_ASSERT(position % nr_bytes == 0); - for (idx_t i = 0; i < position / nr_bytes; i++) { - file_handle->Read(buffer, nr_bytes); - } - return; + throw InternalException("Trying to seek a compressed CSV File."); } file_handle->Seek(position); } diff --git a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp index a62aed3ca3d7..057120591132 100644 --- a/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp +++ b/src/execution/operator/csv_scanner/sniffer/csv_sniffer.cpp @@ -93,7 +93,13 @@ SnifferResult CSVSniffer::SniffCSV(bool force_match) { DetectHeader(); // 5. 
Type Replacement
 	ReplaceTypes();
-	buffer_manager->ResetBufferManager();
+
+	// We reset the buffer for compressed files.
+	// This is done because we can't easily seek on compressed files; if a buffer goes out of scope, we must read from
+	// the start.
+	if (!buffer_manager->file_handle->uncompressed) {
+		buffer_manager->ResetBufferManager();
+	}
 
 	if (!best_candidate->error_handler->errors.empty() && !options.ignore_errors) {
 		for (auto &error_vector : best_candidate->error_handler->errors) {
 			for (auto &error : error_vector.second) {
diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
index 9e8afd52a404..a3cca5a55ea2 100644
--- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
+++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
@@ -24,7 +24,7 @@ CSVGlobalState::CSVGlobalState(ClientContext &context_p, const shared_ptr
 		    make_uniq<CSVFileScan>(context, files[0], options, 0, bind_data, column_ids, file_schema));
 	};
-	//! There are situations where we only support single threaded scanning
+	// There are situations where we only support single threaded scanning
 	bool many_csv_files = files.size() > 1 && files.size() > system_threads * 2;
 	single_threaded = many_csv_files || !options.parallel;
 	last_file_idx = 0;
@@ -56,7 +56,7 @@ double CSVGlobalState::GetProgress(const ReadCSVData &bind_data_p) const {
 	return percentage * 100;
 }
 
-unique_ptr<StringValueScanner> CSVGlobalState::Next(StringValueScanner *previous_scanner) {
+unique_ptr<StringValueScanner> CSVGlobalState::Next(optional_ptr<StringValueScanner> previous_scanner) {
 	if (single_threaded) {
 		idx_t cur_idx = last_file_idx++;
 		if (cur_idx >= bind_data.files.size()) {
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp
index 7d4b55e424fd..43e4e275583e 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_file_handle.hpp
@@ -26,7 +26,7 @@ struct CSVFileHandle {
 public:
 	bool CanSeek();
-	void Seek(void *buffer, idx_t nr_bytes, idx_t position);
+	void Seek(idx_t position);
 	bool OnDiskFile();
 	bool IsPipe();
diff --git a/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp b/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp
index 4d123480f45c..648948c19e37 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/global_csv_state.hpp
@@ -30,7 +30,7 @@ struct CSVGlobalState : public GlobalTableFunctionState {
 	//! Generates a CSV Scanner, with information regarding the piece of buffer it should be read.
 	//! In case it returns a nullptr it means we are done reading these files.
-	unique_ptr<StringValueScanner> Next(StringValueScanner *previous_scanner);
+	unique_ptr<StringValueScanner> Next(optional_ptr<StringValueScanner> previous_scanner);
 
 	void FillRejectsTable();
diff --git a/test/sql/copy/csv/test_big_compressed.test_slow b/test/sql/copy/csv/test_big_compressed.test_slow
new file mode 100644
index 000000000000..eaa766ac2fa6
--- /dev/null
+++ b/test/sql/copy/csv/test_big_compressed.test_slow
@@ -0,0 +1,32 @@
+# name: test/sql/copy/csv/test_big_compressed.test_slow
+# description: Test scan over a single big compressed csv file
+# group: [csv]
+
+# This test is way too slow to run on CI; generating a SF100 TPC-H lineitem file takes a LOT of time.
+# Still useful to check for problems locally.
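[Editor's note, not part of the patch series — the new test listing continues right after this aside. PATCH 053 emulated Seek() on gzipped input by resetting the stream and re-reading whole buffers up to the target position, and PATCH 058 above drops that in favour of throwing, resetting the buffer manager after sniffing instead. A minimal sketch of the dropped brute-force approach, assuming a hypothetical ForwardOnlyStream rather than the real DuckDB file handle:

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical stand-in for a non-seekable source (a pipe or a gzip
// stream): it can only be reset to the start and read forward.
struct ForwardOnlyStream {
	std::vector<char> data;
	std::size_t pos = 0;

	void Reset() {
		pos = 0;
	}
	std::size_t Read(char *out, std::size_t nr_bytes) {
		std::size_t to_read = std::min(nr_bytes, data.size() - pos);
		std::copy_n(data.data() + pos, to_read, out);
		pos += to_read;
		return to_read;
	}
};

// "Seek" by resetting and re-reading whole buffers from the start; the
// target must be buffer-aligned, hence the assert (as in the D_ASSERT of
// PATCH 053).
void BruteForceSeek(ForwardOnlyStream &stream, char *buffer, std::size_t nr_bytes, std::size_t position) {
	assert(position % nr_bytes == 0);
	stream.Reset();
	for (std::size_t i = 0; i < position / nr_bytes; i++) {
		stream.Read(buffer, nr_bytes);
	}
}

int main() {
	ForwardOnlyStream stream;
	stream.data.assign(64, 'x');
	char buffer[16];
	BruteForceSeek(stream, buffer, sizeof(buffer), 32); // re-reads two buffers
	return 0;
}

The cost is linear in the seek target, which is why PATCH 058 removes it: seeking a pipe or a compressed file now raises an InternalException, and the sniffer instead resets the buffer manager so the real scan re-reads the file from the start. End of editorial aside.]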
+mode skip + +require tpch + +statement ok +CALL dbgen(sf=100); + +statement ok +copy lineitem to '__TEST_DIR__/lineitem_100.csv.gz'; + +statement ok +SET temp_directory='' + +# load the DB from disk (Avoids OOM when generating ze table) +load __TEST_DIR__/lineitem_100_compressed.db + +statement ok +CREATE TABLE lineitem_2(l_orderkey INTEGER NOT NULL, l_partkey INTEGER NOT NULL, l_suppkey INTEGER NOT NULL, l_linenumber INTEGER NOT NULL, l_quantity DECIMAL(15,2) NOT NULL, l_extendedprice DECIMAL(15,2) NOT NULL, l_discount DECIMAL(15,2) NOT NULL, l_tax DECIMAL(15,2) NOT NULL, l_returnflag VARCHAR NOT NULL, l_linestatus VARCHAR NOT NULL, l_shipdate DATE NOT NULL, l_commitdate DATE NOT NULL, l_receiptdate DATE NOT NULL, l_shipinstruct VARCHAR NOT NULL, l_shipmode VARCHAR NOT NULL, l_comment VARCHAR NOT NULL); + +statement ok +INSERT INTO lineitem_2 FROM '__TEST_DIR__/lineitem_100.csv.gz'; + +query I +select count(*) from lineitem_2 +---- +600037902 \ No newline at end of file diff --git a/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow new file mode 100644 index 000000000000..f4183fb24401 --- /dev/null +++ b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow @@ -0,0 +1,53 @@ +# name: test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow +# description: Test scan over multiple compressed big csv files +# group: [csv] + +require tpch + +statement ok +CALL dbgen(sf=10); + +statement ok +copy lineitem to ' __TEST_DIR__/lineitem.csv.gz'; + +statement ok +SET temp_directory='' + +# load the DB from disk (Avoids OOM when generating ze table) +load __TEST_DIR__/lineitem_compressed.db + +statement ok +CREATE TABLE lineitem_2(l_orderkey INTEGER NOT NULL, l_partkey INTEGER NOT NULL, l_suppkey INTEGER NOT NULL, l_linenumber INTEGER NOT NULL, l_quantity DECIMAL(15,2) NOT NULL, l_extendedprice DECIMAL(15,2) NOT NULL, l_discount DECIMAL(15,2) NOT NULL, l_tax DECIMAL(15,2) NOT NULL, l_returnflag VARCHAR NOT NULL, l_linestatus VARCHAR NOT NULL, l_shipdate DATE NOT NULL, l_commitdate DATE NOT NULL, l_receiptdate DATE NOT NULL, l_shipinstruct VARCHAR NOT NULL, l_shipmode VARCHAR NOT NULL, l_comment VARCHAR NOT NULL); + +statement ok +INSERT INTO lineitem_2 FROM read_csv([ + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', + ' __TEST_DIR__/lineitem.csv.gz', +]); + +query I +select count(*) from lineitem_2 +---- +1439665248 \ No newline at end of file From 55112d11ec4612eea3db506b162e9d69a4f98ddd Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 2 Apr 2024 13:22:59 +0200 Subject: [PATCH 059/147] Woopsie on path --- ...est_multiple_big_compressed_csvs.test_slow | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git 
a/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow index f4183fb24401..07b99f83c601 100644 --- a/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow +++ b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow @@ -8,7 +8,7 @@ statement ok CALL dbgen(sf=10); statement ok -copy lineitem to ' __TEST_DIR__/lineitem.csv.gz'; +copy lineitem to '__TEST_DIR__/lineitem.csv.gz'; statement ok SET temp_directory='' @@ -21,30 +21,30 @@ CREATE TABLE lineitem_2(l_orderkey INTEGER NOT NULL, l_partkey INTEGER NOT NULL, statement ok INSERT INTO lineitem_2 FROM read_csv([ - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', - ' __TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', + '__TEST_DIR__/lineitem.csv.gz', ]); query I From 30beaa82080cd95bb7ea52516460f120f94bc199 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 2 Apr 2024 18:11:00 +0200 Subject: [PATCH 060/147] More on merge --- .github/regression/micro_extended.csv | 1 + .../scanner/string_value_scanner.cpp | 21 +++---- .../csv_scanner/sniffer/dialect_detection.cpp | 4 +- .../operator/csv_scanner/util/csv_error.cpp | 58 +++++++++---------- .../operator/csv_scanner/csv_error.hpp | 3 +- src/storage/serialization/serialize_nodes.cpp | 56 +++++++++--------- 6 files changed, 67 insertions(+), 76 deletions(-) diff --git a/.github/regression/micro_extended.csv b/.github/regression/micro_extended.csv index 6973785b4c98..a9517ef309b4 100644 --- a/.github/regression/micro_extended.csv +++ b/.github/regression/micro_extended.csv @@ -78,6 +78,7 @@ benchmark/micro/copy/to_parquet_partition_by_few.benchmark benchmark/micro/copy/to_parquet_partition_by_many.benchmark benchmark/micro/csv/16_byte_values.benchmark benchmark/micro/csv/1_byte_values.benchmark +benchmark/micro/csv/1brl.benchmark benchmark/micro/csv/multiple_read.benchmark benchmark/micro/csv/multiple_small_read_csv.benchmark benchmark/micro/csv/null_padding.benchmark diff --git 
a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index f8018e5fcdde..448dc8382c4d 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -380,10 +380,10 @@ bool StringValueResult::HandleError() { line_pos.GetGlobalPosition(requested_size, first_nl)); break; case CSVErrorType::CAST_ERROR: - csv_error = CSVError::CastError(state_machine.options, names[cur_error.col_idx], cur_error.error_message, - cur_error.col_idx, borked_line, lines_per_batch, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl), - line_pos.GetGlobalPosition(requested_size, first_nl)); + csv_error = CSVError::CastError( + state_machine.options, names[cur_error.col_idx], cur_error.error_message, cur_error.col_idx, + borked_line, lines_per_batch, current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + line_pos.GetGlobalPosition(requested_size, first_nl), parse_types[cur_error.col_idx].first); break; default: throw InvalidInputException("CSV Error not allowed when inserting row"); @@ -730,11 +730,8 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { auto csv_error = CSVError::CastError( state_machine->options, csv_file_scan->names[col_idx], error_message, col_idx, borked_line, lines_per_batch, - result.line_positions_per_row[line_error].begin.GetGlobalPosition(result.result_size, first_nl), - -1); - auto csv_error = - CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], error_message, col_idx, - row, lines_per_batch, result_vector.GetType().id()); + result.line_positions_per_row[line_error].begin.GetGlobalPosition(result.result_size, first_nl), -1, + result_vector.GetType().id()); error_handler->Error(csv_error); } @@ -758,11 +755,7 @@ void StringValueScanner::Flush(DataChunk &insert_chunk) { state_machine->options, csv_file_scan->names[col_idx], error_message, col_idx, borked_line, lines_per_batch, result.line_positions_per_row[line_error].begin.GetGlobalPosition(result.result_size, first_nl), - -1); - auto csv_error = - CSVError::CastError(state_machine->options, csv_file_scan->names[col_idx], error_message, - col_idx, row, lines_per_batch, result_vector.GetType().id()); - + -1, result_vector.GetType().id()); error_handler->Error(csv_error); } } diff --git a/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp b/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp index 03173547fa98..e6ffae7e30e7 100644 --- a/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp +++ b/src/execution/operator/csv_scanner/sniffer/dialect_detection.cpp @@ -185,7 +185,7 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr scanner, best_consistent_rows = consistent_rows; max_columns_found = num_cols; prev_padding_count = padding_count; - if (!options.null_padding && !options.ignore_errors) { + if (!options.null_padding && !options.ignore_errors.GetValue()) { sniffing_state_machine.dialect_options.skip_rows = start_row; } else { sniffing_state_machine.dialect_options.skip_rows = options.dialect_options.skip_rows.GetValue(); @@ -210,7 +210,7 @@ void CSVSniffer::AnalyzeDialectCandidate(unique_ptr scanner, } } if (!same_quote_is_candidate) { - if (!options.null_padding && !options.ignore_errors) { + if (!options.null_padding && !options.ignore_errors.GetValue()) { sniffing_state_machine.dialect_options.skip_rows = start_row; } else { 
sniffing_state_machine.dialect_options.skip_rows = options.dialect_options.skip_rows.GetValue(); diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index b6236553fbf0..7c7260c5d2e4 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -114,9 +114,8 @@ CSVError CSVError::ColumnTypesError(case_insensitive_map_t sql_types_per_ } CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, idx_t column_idx, - vector &row, LinesPerBoundary error_info, LogicalTypeId type) { string &csv_row, LinesPerBoundary error_info, idx_t row_byte_position, - int64_t byte_position) { + int64_t byte_position, LogicalTypeId type) { std::ostringstream error; // Which column error << "Error when converting column \"" << column_name << "\"." << '\n'; @@ -139,16 +138,16 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_nam << '\n'; } -return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info, row_byte_position, + return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info, row_byte_position, byte_position, options); - } +} CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info, string &csv_row, idx_t byte_position) { std::ostringstream error; error << "Maximum line size of " << options.maximum_line_size << " bytes exceeded. "; error << "Actual Size:" << actual_size << " bytes." << '\n'; -return CSVError(error.str(), CSVErrorType::MAXIMUM_LINE_SIZE, 0, csv_row, error_info, byte_position, byte_position, + return CSVError(error.str(), CSVErrorType::MAXIMUM_LINE_SIZE, 0, csv_row, error_info, byte_position, byte_position, options); } @@ -176,45 +175,44 @@ CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, idx_ std::ostringstream error; error << "Value with unterminated quote found." 
<< '\n'; error << '\n'; -return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info, - row_byte_position, byte_position, options);} + return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info, + row_byte_position, byte_position, options); +} CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, idx_t actual_columns, LinesPerBoundary error_info, string &csv_row, idx_t row_byte_position, int64_t byte_position) { std::ostringstream error; - // How many columns were expected and how many were found -// error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1; -// if (actual_columns >= options.dialect_options.num_cols) { -// return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, -// row_byte_position, byte_position, options); -// } else { -// return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, -// row_byte_position, byte_position, options); -// } - - // How many columns were expected and how many were found - error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns << '\n'; - error << '\n' << "Possible fixes:" << '\n'; - if (!options.null_padding) { - error << "* Enable null padding (null_padding=true) to replace missing values with NULL" << '\n'; - } - if (!options.ignore_errors) { - error << "* Enable ignore errors (ignore_errors=true) to skip this row" << '\n'; + error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1; + if (actual_columns >= options.dialect_options.num_cols) { + return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info, + row_byte_position, byte_position, options); + } else { + return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info, + row_byte_position, byte_position, options); } - error << '\n'; - // What were the options - error << options.ToString(); - return CSVError(error.str(), CSVErrorType::INCORRECT_COLUMN_AMOUNT, error_info); + + // // How many columns were expected and how many were found + // error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns << + //'\n'; error << '\n' << "Possible fixes:" << '\n'; if (!options.null_padding) { error << "* Enable null padding + //(null_padding=true) to replace missing values with NULL" << '\n'; + // } + // if (!options.ignore_errors) { + // error << "* Enable ignore errors (ignore_errors=true) to skip this row" << '\n'; + // } + // error << '\n'; + // // What were the options + // error << options.ToString(); + // return CSVError(error.str(), CSVErrorType::INCORRECT_COLUMN_AMOUNT, error_info); } CSVError CSVError::InvalidUTF8(const CSVReaderOptions &options, idx_t current_column, LinesPerBoundary error_info, string &csv_row, idx_t row_byte_position, int64_t byte_position) { std::ostringstream error; // How many columns were expected and how many were found - error << "Invalid unicode (byte sequence mismatch) detected."<< '\n'; + error << "Invalid unicode (byte sequence mismatch) detected." 
<< '\n'; return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, current_column, csv_row, error_info, row_byte_position, byte_position, options); } diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp index ea44c54462ff..1ffc21423344 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp @@ -59,8 +59,7 @@ class CSVError { //! Produces error messages for casting errors static CSVError CastError(const CSVReaderOptions &options, string &column_name, string &cast_error, idx_t column_idx, string &csv_row, LinesPerBoundary error_info, idx_t row_byte_position, - int64_t byte_position); - idx_t column_idx, vector &row, LinesPerBoundary error_info, LogicalTypeId type); + int64_t byte_position, LogicalTypeId type); //! Produces error for when the line size exceeds the maximum line size option static CSVError LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info, string &csv_row, idx_t byte_position); diff --git a/src/storage/serialization/serialize_nodes.cpp b/src/storage/serialization/serialize_nodes.cpp index 44531988a2cd..2359d127ef24 100644 --- a/src/storage/serialization/serialize_nodes.cpp +++ b/src/storage/serialization/serialize_nodes.cpp @@ -118,7 +118,7 @@ CSVOption CSVOption::Deserialize(Deserializer &deserializer) { } void CSVReaderOptions::Serialize(Serializer &serializer) const { - serializer.WritePropertyWithDefault(100, "ignore_errors", ignore_errors); + serializer.WriteProperty>(100, "ignore_errors", ignore_errors); serializer.WritePropertyWithDefault(101, "buffer_sample_size", buffer_sample_size); serializer.WritePropertyWithDefault(102, "null_str", null_str); serializer.WriteProperty(103, "compression", compression); @@ -135,26 +135,26 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault(114, "buffer_size", buffer_size); serializer.WriteProperty(115, "file_options", file_options); serializer.WritePropertyWithDefault>(116, "force_quote", force_quote); - serializer.WritePropertyWithDefault(117, "rejects_table_name", rejects_table_name); + serializer.WriteProperty>(117, "store_rejects", store_rejects); serializer.WritePropertyWithDefault(118, "rejects_limit", rejects_limit); - serializer.WritePropertyWithDefault>(119, "rejects_recovery_columns", rejects_recovery_columns); - serializer.WritePropertyWithDefault>(120, "rejects_recovery_column_ids", rejects_recovery_column_ids); - serializer.WriteProperty>(121, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter); - serializer.WriteProperty>(122, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote); - serializer.WriteProperty>(123, "dialect_options.state_machine_options.escape", dialect_options.state_machine_options.escape); - serializer.WriteProperty>(124, "dialect_options.header", dialect_options.header); - serializer.WritePropertyWithDefault(125, "dialect_options.num_cols", dialect_options.num_cols); - serializer.WriteProperty>(126, "dialect_options.state_machine_options.new_line", dialect_options.state_machine_options.new_line); - serializer.WriteProperty>(127, "dialect_options.skip_rows", dialect_options.skip_rows); - serializer.WriteProperty>>(128, "dialect_options.date_format", dialect_options.date_format); - serializer.WritePropertyWithDefault(129, 
"sniffer_user_mismatch_error", sniffer_user_mismatch_error); - serializer.WritePropertyWithDefault(130, "parallel", parallel); + serializer.WriteProperty>(119, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter); + serializer.WriteProperty>(120, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote); + serializer.WriteProperty>(121, "dialect_options.state_machine_options.escape", dialect_options.state_machine_options.escape); + serializer.WriteProperty>(122, "dialect_options.header", dialect_options.header); + serializer.WritePropertyWithDefault(123, "dialect_options.num_cols", dialect_options.num_cols); + serializer.WriteProperty>(124, "dialect_options.state_machine_options.new_line", dialect_options.state_machine_options.new_line); + serializer.WriteProperty>(125, "dialect_options.skip_rows", dialect_options.skip_rows); + serializer.WriteProperty>>(126, "dialect_options.date_format", dialect_options.date_format); + serializer.WritePropertyWithDefault(127, "sniffer_user_mismatch_error", sniffer_user_mismatch_error); + serializer.WritePropertyWithDefault(128, "parallel", parallel); + serializer.WriteProperty>(129, "rejects_table_name", rejects_table_name); + serializer.WriteProperty>(130, "rejects_scan_name", rejects_scan_name); serializer.WritePropertyWithDefault>(131, "was_type_manually_set", was_type_manually_set); } CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { CSVReaderOptions result; - deserializer.ReadPropertyWithDefault(100, "ignore_errors", result.ignore_errors); + deserializer.ReadProperty>(100, "ignore_errors", result.ignore_errors); deserializer.ReadPropertyWithDefault(101, "buffer_sample_size", result.buffer_sample_size); deserializer.ReadPropertyWithDefault(102, "null_str", result.null_str); deserializer.ReadProperty(103, "compression", result.compression); @@ -171,20 +171,20 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { deserializer.ReadPropertyWithDefault(114, "buffer_size", result.buffer_size); deserializer.ReadProperty(115, "file_options", result.file_options); deserializer.ReadPropertyWithDefault>(116, "force_quote", result.force_quote); - deserializer.ReadPropertyWithDefault(117, "rejects_table_name", result.rejects_table_name); + deserializer.ReadProperty>(117, "store_rejects", result.store_rejects); deserializer.ReadPropertyWithDefault(118, "rejects_limit", result.rejects_limit); - deserializer.ReadPropertyWithDefault>(119, "rejects_recovery_columns", result.rejects_recovery_columns); - deserializer.ReadPropertyWithDefault>(120, "rejects_recovery_column_ids", result.rejects_recovery_column_ids); - deserializer.ReadProperty>(121, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter); - deserializer.ReadProperty>(122, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote); - deserializer.ReadProperty>(123, "dialect_options.state_machine_options.escape", result.dialect_options.state_machine_options.escape); - deserializer.ReadProperty>(124, "dialect_options.header", result.dialect_options.header); - deserializer.ReadPropertyWithDefault(125, "dialect_options.num_cols", result.dialect_options.num_cols); - deserializer.ReadProperty>(126, "dialect_options.state_machine_options.new_line", result.dialect_options.state_machine_options.new_line); - deserializer.ReadProperty>(127, "dialect_options.skip_rows", 
result.dialect_options.skip_rows);
-	deserializer.ReadProperty<map<LogicalTypeId, CSVOption<StrpTimeFormat>>>(128, "dialect_options.date_format", result.dialect_options.date_format);
-	deserializer.ReadPropertyWithDefault<string>(129, "sniffer_user_mismatch_error", result.sniffer_user_mismatch_error);
-	deserializer.ReadPropertyWithDefault<bool>(130, "parallel", result.parallel);
+	deserializer.ReadProperty<CSVOption<char>>(119, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter);
+	deserializer.ReadProperty<CSVOption<char>>(120, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote);
+	deserializer.ReadProperty<CSVOption<char>>(121, "dialect_options.state_machine_options.escape", result.dialect_options.state_machine_options.escape);
+	deserializer.ReadProperty<CSVOption<bool>>(122, "dialect_options.header", result.dialect_options.header);
+	deserializer.ReadPropertyWithDefault<idx_t>(123, "dialect_options.num_cols", result.dialect_options.num_cols);
+	deserializer.ReadProperty<CSVOption<NewLineIdentifier>>(124, "dialect_options.state_machine_options.new_line", result.dialect_options.state_machine_options.new_line);
+	deserializer.ReadProperty<CSVOption<idx_t>>(125, "dialect_options.skip_rows", result.dialect_options.skip_rows);
+	deserializer.ReadProperty<map<LogicalTypeId, CSVOption<StrpTimeFormat>>>(126, "dialect_options.date_format", result.dialect_options.date_format);
+	deserializer.ReadPropertyWithDefault<string>(127, "sniffer_user_mismatch_error", result.sniffer_user_mismatch_error);
+	deserializer.ReadPropertyWithDefault<bool>(128, "parallel", result.parallel);
+	deserializer.ReadProperty<CSVOption<string>>(129, "rejects_table_name", result.rejects_table_name);
+	deserializer.ReadProperty<CSVOption<string>>(130, "rejects_scan_name", result.rejects_scan_name);
 	deserializer.ReadPropertyWithDefault<vector<bool>>(131, "was_type_manually_set", result.was_type_manually_set);
 	return result;
 }

From 23680a061fde7a20ec711bc6645996e6c149d73e Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Tue, 2 Apr 2024 18:53:06 +0200
Subject: [PATCH 061/147] Force the fix message to be set on all CSV errors

---
 .../operator/csv_scanner/util/csv_error.cpp  | 87 ++++++++++---------
 .../operator/csv_scanner/csv_error.hpp       | 10 ++-
 2 files changed, 54 insertions(+), 43 deletions(-)

diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp
index 7c7260c5d2e4..dbe43899e6e2 100644
--- a/src/execution/operator/csv_scanner/util/csv_error.cpp
+++ b/src/execution/operator/csv_scanner/util/csv_error.cpp
@@ -18,10 +18,10 @@ void CSVErrorHandler::ThrowError(CSVError csv_error) {
 	if (PrintLineNumber(csv_error)) {
 		error << "CSV Error on Line: " << GetLine(csv_error.error_info) << '\n';
 	}
-	if (csv_error.error_message_with_options.empty()) {
+	if (csv_error.full_error_message.empty()) {
 		error << csv_error.error_message;
 	} else {
-		error << csv_error.error_message_with_options;
+		error << csv_error.full_error_message;
 	}
 	switch (csv_error.type) {
 	case CSVErrorType::CAST_ERROR:
@@ -82,15 +82,16 @@ CSVError::CSVError(string error_message_p, CSVErrorType type_p, LinesPerBoundary
 CSVError::CSVError(string error_message_p, CSVErrorType type_p, idx_t column_idx_p, string csv_row_p,
                    LinesPerBoundary error_info_p, idx_t row_byte_position, int64_t byte_position_p,
-                   const CSVReaderOptions &reader_options)
+                   const CSVReaderOptions &reader_options, const string &fixes)
     : error_message(std::move(error_message_p)), type(type_p), column_idx(column_idx_p), csv_row(std::move(csv_row_p)),
       error_info(error_info_p), row_byte_position(row_byte_position), byte_position(byte_position_p) {
 	// What were the options
 	std::ostringstream error;
-	error << error_message << std::endl;
+	error << error_message << '\n';
+	error << fixes << '\n';
 	error << reader_options.ToString();
-	error << std::endl;
-	error_message_with_options = error.str();
+	error << '\n';
+	full_error_message = error.str();
 }

 CSVError CSVError::ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column, const vector<string> &names) {
@@ -118,28 +119,30 @@ CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_nam
                              int64_t byte_position, LogicalTypeId type) {
 	std::ostringstream error;
 	// Which column
-	error << "Error when converting column \"" << column_name << "\"." << '\n';
+	error << "Error when converting column \"" << column_name << "\". ";
 	// What was the cast error
 	error << cast_error << '\n';
-
-	error << "Column " << column_name << " is being converted as type " << LogicalTypeIdToString(type) << '\n';
+	std::ostringstream how_to_fix_it;
+	how_to_fix_it << "Column " << column_name << " is being converted as type " << LogicalTypeIdToString(type) << '\n';
 	if (!options.WasTypeManuallySet(column_idx)) {
-		error << "This type was auto-detected from the CSV file." << '\n';
-		error << "Possible solutions:" << '\n';
-		error << "* Override the type for this column manually by setting the type explicitly, e.g. types={'"
-		      << column_name << "': 'VARCHAR'}" << '\n';
-		error << "* Set the sample size to a larger value to enable the auto-detection to scan more values, e.g. "
-		         "sample_size=-1"
-		      << '\n';
-		error << "* Use a COPY statement to automatically derive types from an existing table." << '\n';
+		how_to_fix_it << "This type was auto-detected from the CSV file." << '\n';
+		how_to_fix_it << "Possible solutions:" << '\n';
+		how_to_fix_it << "* Override the type for this column manually by setting the type explicitly, e.g. types={'"
+		              << column_name << "': 'VARCHAR'}" << '\n';
+		how_to_fix_it
+		    << "* Set the sample size to a larger value to enable the auto-detection to scan more values, e.g. "
+		       "sample_size=-1"
+		    << '\n';
+		how_to_fix_it << "* Use a COPY statement to automatically derive types from an existing table." << '\n';
 	} else {
-		error << "This type was either manually set or derived from an existing table. Select a different type to "
-		         "correctly parse this column."
-		      << '\n';
+		how_to_fix_it
+		    << "This type was either manually set or derived from an existing table. Select a different type to "
+		       "correctly parse this column."
+		    << '\n';
 	}
 	return CSVError(error.str(), CSVErrorType::CAST_ERROR, column_idx, csv_row, error_info, row_byte_position,
-	                byte_position, options);
+	                byte_position, options, how_to_fix_it.str());
 }

 CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_size, LinesPerBoundary error_info,
@@ -147,8 +150,13 @@ CSVError CSVError::LineSizeError(const CSVReaderOptions &options, idx_t actual_s
 	std::ostringstream error;
 	error << "Maximum line size of " << options.maximum_line_size << " bytes exceeded. ";
 	error << "Actual Size:" << actual_size << " bytes." << '\n';
+
+	std::ostringstream how_to_fix_it;
+	how_to_fix_it << "Possible Solution: Change the maximum length size, e.g., max_line_size=" << actual_size + 1
+	              << "\n";
+
 	return CSVError(error.str(), CSVErrorType::MAXIMUM_LINE_SIZE, 0, csv_row, error_info, byte_position, byte_position,
-	                options);
+	                options, how_to_fix_it.str());
 }

 CSVError CSVError::SniffingError(string &file_path) {
@@ -175,37 +183,34 @@ CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, idx_
 	std::ostringstream error;
 	error << "Value with unterminated quote found." << '\n';
 	error << '\n';
+	std::ostringstream how_to_fix_it;
+	how_to_fix_it << "Possible Solution: Enable ignore errors (ignore_errors=true) to skip this row" << '\n';
 	return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info,
-	                row_byte_position, byte_position, options);
+	                row_byte_position, byte_position, options, how_to_fix_it.str());
 }

 CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, idx_t actual_columns,
                                               LinesPerBoundary error_info, string &csv_row, idx_t row_byte_position,
                                               int64_t byte_position) {
 	std::ostringstream error;
-
+	// Build the list of possible fixes for this error
+	std::ostringstream how_to_fix_it;
+	how_to_fix_it << "Possible fixes:" << '\n';
+	if (!options.null_padding) {
+		how_to_fix_it << "* Enable null padding (null_padding=true) to replace missing values with NULL" << '\n';
+	}
+	if (!options.ignore_errors.GetValue()) {
+		how_to_fix_it << "* Enable ignore errors (ignore_errors=true) to skip this row" << '\n';
+	}
 	// How many columns were expected and how many were found
 	error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1;
 	if (actual_columns >= options.dialect_options.num_cols) {
 		return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info,
-		                row_byte_position, byte_position, options);
+		                row_byte_position, byte_position, options, how_to_fix_it.str());
 	} else {
 		return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info,
-		                row_byte_position, byte_position, options);
+		                row_byte_position, byte_position, options, how_to_fix_it.str());
 	}
-
-	// // How many columns were expected and how many were found
-	// error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns <<
-	//'\n';
-	// error << '\n' << "Possible fixes:" << '\n';
-	// if (!options.null_padding) {
-	// 	error << "* Enable null padding (null_padding=true) to replace missing values with NULL" << '\n';
-	// }
-	// if (!options.ignore_errors) {
-	// 	error << "* Enable ignore errors (ignore_errors=true) to skip this row" << '\n';
-	// }
-	// error << '\n';
-	// // What were the options
-	// error << options.ToString();
-	// return CSVError(error.str(), CSVErrorType::INCORRECT_COLUMN_AMOUNT, error_info);
 }

 CSVError CSVError::InvalidUTF8(const CSVReaderOptions &options, idx_t current_column, LinesPerBoundary error_info,
@@ -213,8 +218,10 @@ CSVError CSVError::InvalidUTF8(const CSVReaderOptions &options, idx_t current_co
 	std::ostringstream error;
 	// How many columns were expected and how many were found
 	error << "Invalid unicode (byte sequence mismatch) detected." << '\n';
+	std::ostringstream how_to_fix_it;
+	how_to_fix_it << "Possible Solution: Enable ignore errors (ignore_errors=true) to skip this row" << '\n';
 	return CSVError(error.str(), CSVErrorType::INVALID_UNICODE, current_column, csv_row, error_info, row_byte_position,
-	                byte_position, options);
+	                byte_position, options, how_to_fix_it.str());
 }

 bool CSVErrorHandler::PrintLineNumber(CSVError &error) {
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
index 1ffc21423344..c0f556ffba32 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp
@@ -52,7 +52,8 @@ class CSVError {
 public:
 	CSVError() {};
 	CSVError(string error_message, CSVErrorType type, idx_t column_idx, string csv_row, LinesPerBoundary error_info,
-	         idx_t row_byte_position, int64_t byte_position, const CSVReaderOptions &reader_options);
+	         idx_t row_byte_position, int64_t byte_position, const CSVReaderOptions &reader_options,
+	         const string &fixes);
 	CSVError(string error_message, CSVErrorType type, LinesPerBoundary error_info);
 	//! Produces error messages for column name -> type mismatch.
 	static CSVError ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_column, const vector<string> &names);
@@ -84,8 +85,11 @@ class CSVError {
 	//! Actual error message
 	string error_message;
-	//! Actual error message
-	string error_message_with_options;
+	//! Full error message used in throws:
+	//! 1. The actual error
+	//! 2. How to fix it
+	//! 3. The options that generated the error
+	string full_error_message;
 	//! Error Type
 	CSVErrorType type;
 	//! Column Index where error happened
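The constructor above now assembles full_error_message from three parts. As a standalone sketch of that layout, with simplified plain-string parameters rather than the exact DuckDB signatures:

    #include <sstream>
    #include <string>

    // Sketch: full message = the actual error, then the suggested fixes,
    // then a dump of the reader options that produced the error.
    std::string BuildFullErrorMessage(const std::string &error_message, const std::string &fixes,
                                      const std::string &options_dump) {
        std::ostringstream full;
        full << error_message << '\n'; // 1. what went wrong
        full << fixes << '\n';         // 2. how to fix it
        full << options_dump << '\n';  // 3. options that generated the error
        return full.str();
    }

From 0c7008b3c0d74817b0bda3c2354ed082a546202f Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 3 Apr 2024 11:30:36 +0200
Subject: [PATCH 062/147] Fix small issue with newline null padding in parallel

---
 .../csv_scanner/scanner/string_value_scanner.cpp      | 11 ++++++-----
 src/execution/operator/csv_scanner/util/csv_error.cpp |  1 -
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 448dc8382c4d..07bf849e4a9c 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -405,9 +405,8 @@ void StringValueResult::QuotedNewLine(StringValueResult &result) {

 void StringValueResult::NullPaddingQuotedNewlineCheck() {
 	// We do some checks for null_padding correctness
-	if (state_machine.options.null_padding && iterator.IsBoundarySet() && quoted_new_line && iterator.done) {
-		// If we have null_padding set, we found a quoted new line, we are scanning the file in parallel, and it's the
-		// last row of this thread.
+	if (state_machine.options.null_padding && iterator.IsBoundarySet() && quoted_new_line) {
+		// If we have null_padding set, we found a quoted new line, and we are scanning the file in parallel; we error.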
 		LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
 		auto csv_error = CSVError::NullPaddingFail(state_machine.options, lines_per_batch);
 		error_handler.Error(csv_error);
@@ -1035,14 +1034,16 @@ bool StringValueScanner::MoveToNextBuffer() {
 			// And an extra empty value to represent what comes after the delimiter
 			result.AddRow(result, previous_buffer_handle->actual_size);
 			lines_read++;
-		} else if (states.IsQuotedCurrent()) {
+		}
+		else if (states.IsQuotedCurrent()) {
 			// Unterminated quote
 			LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size};
 			result.current_line_position.begin = result.current_line_position.end;
 			result.current_line_position.end = current_line_start;
 			result.InvalidState(result);
-		} else {
+		}
+		else {
 			result.AddRow(result, previous_buffer_handle->actual_size);
 			lines_read++;
 		}

diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp
index dbe43899e6e2..d8bafa91e72d 100644
--- a/src/execution/operator/csv_scanner/util/csv_error.cpp
+++ b/src/execution/operator/csv_scanner/util/csv_error.cpp
@@ -182,7 +182,6 @@ CSVError CSVError::UnterminatedQuotesError(const CSVReaderOptions &options, idx_
                                            int64_t byte_position) {
 	std::ostringstream error;
 	error << "Value with unterminated quote found." << '\n';
-	error << '\n';
 	std::ostringstream how_to_fix_it;
 	how_to_fix_it << "Possible Solution: Enable ignore errors (ignore_errors=true) to skip this row" << '\n';
 	return CSVError(error.str(), CSVErrorType::UNTERMINATED_QUOTES, current_column, csv_row, error_info,
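Dropping the iterator.done term above means the check now fires for any parallel boundary scan that sees a quoted newline under null_padding, not only on a thread's last row. A reduced sketch of the condition, with hypothetical flat fields standing in for the scanner state:

    #include <stdexcept>

    struct ScanState {
        bool null_padding = false;    // the null_padding option is enabled
        bool boundary_set = false;    // the file is scanned in parallel chunks
        bool quoted_new_line = false; // a newline was seen inside quotes
    };

    // Sketch: quoted newlines cannot be split safely across parallel chunk
    // boundaries when null_padding is on, so the scan errors out.
    void NullPaddingQuotedNewlineCheck(const ScanState &s) {
        if (s.null_padding && s.boundary_set && s.quoted_new_line) {
            throw std::runtime_error("quoted new lines are not supported with null_padding in parallel scans");
        }
    }

From 8f15bb7a5ac3e21ebeca91eb12aafe4717f945a8 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 3 Apr 2024 13:42:33 +0200
Subject: [PATCH 063/147] Simplify error message stored

---
 .../operator/csv_scanner/util/csv_error.cpp   |  9 ++++++
 .../operator/csv_scanner/csv_error.hpp        |  3 ++
 .../csv/rejects/csv_rejects_flush_cast.test   | 30 +++++--------------
 3 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp
index d8bafa91e72d..d9119a4dd9d3 100644
--- a/src/execution/operator/csv_scanner/util/csv_error.cpp
+++ b/src/execution/operator/csv_scanner/util/csv_error.cpp
@@ -1,5 +1,7 @@
 #include "duckdb/execution/operator/csv_scanner/csv_error.hpp"
 #include "duckdb/common/exception/conversion_exception.hpp"
+#include "duckdb/common/string_util.hpp"
+
 #include <sstream>

 namespace duckdb {
@@ -87,6 +89,9 @@ CSVError::CSVError(string error_message_p, CSVErrorType type_p, idx_t column_idx
       error_info(error_info_p), row_byte_position(row_byte_position), byte_position(byte_position_p) {
 	// What were the options
 	std::ostringstream error;
+	if (reader_options.ignore_errors.GetValue()){
+		RemoveNewLine(error_message);
+	}
 	error << error_message << '\n';
 	error << fixes << '\n';
 	error << reader_options.ToString();
@@ -114,6 +119,10 @@ CSVError CSVError::ColumnTypesError(case_insensitive_map_t<idx_t> sql_types_per_
 	return CSVError(exception, CSVErrorType::COLUMN_NAME_TYPE_MISMATCH, {});
 }

+void CSVError::RemoveNewLine(string &error){
+	error = StringUtil::Split(error, "\n")[0];
+}
+
 CSVError CSVError::CastError(const CSVReaderOptions &options, string &column_name, string &cast_error,
                              idx_t column_idx, string &csv_row, LinesPerBoundary error_info, idx_t row_byte_position,
                              int64_t byte_position, LogicalTypeId type) {

diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp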
b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp index c0f556ffba32..340c42cd1c40 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/csv_error.hpp @@ -83,6 +83,9 @@ class CSVError { return error_info.boundary_idx; } + //! We might want to remove newline in errors if we are doing them for the rejects tables + void RemoveNewLine(string &error); + //! Actual error message string error_message; //! Full error message used in throws diff --git a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test index 69530026555e..7af3c29f373d 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test +++ b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test @@ -11,32 +11,16 @@ query III SELECT typeof(first(a)), typeof(first(b)), COUNT(*) FROM read_csv( 'data/csv/error/flush_cast.csv', columns = {'a': 'DATE', 'b': 'VARCHAR'}, - rejects_table='csv_rejects_table', + store_rejects = true, delim = ',', - dateformat = '%d-%m-%Y', - ignore_errors=true); + dateformat = '%d-%m-%Y'); ---- DATE VARCHAR 2811 - -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table order by all; ----- -data/csv/error/flush_cast.csv 2813 1 "a" CAST c, bla 44971 -data/csv/error/flush_cast.csv 439 1 "a" CAST B, bla 6996 - -query I -SELECT error_message -FROM csv_rejects_table where byte_position = 6996; ----- -:.*Could not parse string "B" according to format specifier "%d-%m-%Y".* - -query I -SELECT error_message -FROM csv_rejects_table where byte_position = 44971; +query IIIIIIIIII +SELECT * +FROM reject_errors order by all; ---- -:.*Could not parse string "c" according to format specifier "%d-%m-%Y".* +3 0 439 6997 NULL 1 a CAST B, bla Error when converting column "a". Could not parse string "B" according to format specifier "%d-%m-%Y" +3 0 2813 44972 NULL 1 a CAST c, bla Error when converting column "a". 
Could not parse string "c" according to format specifier "%d-%m-%Y"

-statement ok
-DROP TABLE csv_rejects_table;
\ No newline at end of file
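Commit 063 above trims a stored message to its first line when errors are collected rather than thrown; the same idea in a dependency-free sketch (the actual diff uses StringUtil::Split on the newline and keeps element 0):

    #include <string>

    // Sketch: keep only the first line of a multi-line error message,
    // e.g. before storing it in the rejects table.
    std::string FirstLine(const std::string &error) {
        const auto pos = error.find('\n');
        return pos == std::string::npos ? error : error.substr(0, pos);
    }

From 00efd83289e8d51a14f171fd07b3815c1f116aa0 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 3 Apr 2024 15:06:04 +0200
Subject: [PATCH 064/147] Doing some extra checks to give the correct
 byte-position where errors happen

---
 .../scanner/string_value_scanner.cpp          | 102 +++++++++++++-----
 .../table_function/global_csv_state.cpp       |   2 +-
 .../operator/csv_scanner/util/csv_error.cpp   |   4 +-
 .../csv_scanner/string_value_scanner.hpp      |   7 +-
 .../csv/rejects/csv_buffer_size_rejects.test  |  46 +++-----
 5 files changed, 98 insertions(+), 63 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 07bf849e4a9c..4a00f957e300 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -339,10 +339,17 @@ void StringValueResult::HandleUnicodeError(idx_t col_idx, LinePosition &error_po
 	Utf8Proc::MakeValid(&char_array[0], char_array.size());
 	borked_line = {char_array.begin(), char_array.end() - 1};
 	LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
-	auto csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line,
-	                                       current_line_position.begin.GetGlobalPosition(requested_size, first_nl),
-	                                       error_position.GetGlobalPosition(requested_size, first_nl));
-	error_handler.Error(csv_error, true);
+	if (current_line_position.begin == error_position) {
+		auto csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line,
+		                                       current_line_position.begin.GetGlobalPosition(requested_size, first_nl),
+		                                       error_position.GetGlobalPosition(requested_size, first_nl));
+		error_handler.Error(csv_error, true);
+	} else {
+		auto csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line,
+		                                       current_line_position.begin.GetGlobalPosition(requested_size, first_nl),
+		                                       error_position.GetGlobalPosition(requested_size));
+		error_handler.Error(csv_error, true);
+	}
 }

 bool StringValueResult::HandleError() {
@@ -357,10 +364,17 @@ bool StringValueResult::HandleError() {

 		switch (cur_error.type) {
 		case CSVErrorType::TOO_MANY_COLUMNS:
-			csv_error = CSVError::IncorrectColumnAmountError(
-			    state_machine.options, col_idx, lines_per_batch, borked_line,
-			    current_line_position.begin.GetGlobalPosition(requested_size, first_nl),
-			    line_pos.GetGlobalPosition(requested_size, first_nl));
+			if (current_line_position.begin == line_pos) {
+				csv_error = CSVError::IncorrectColumnAmountError(
+				    state_machine.options, col_idx, lines_per_batch, borked_line,
+				    current_line_position.begin.GetGlobalPosition(requested_size, first_nl),
+				    line_pos.GetGlobalPosition(requested_size, first_nl));
+			} else {
+				csv_error = CSVError::IncorrectColumnAmountError(
+				    state_machine.options, col_idx, lines_per_batch, borked_line,
+				    current_line_position.begin.GetGlobalPosition(requested_size, first_nl),
+				    line_pos.GetGlobalPosition(requested_size));
+			}
 			break;
 		case CSVErrorType::INVALID_UNICODE: {
 			// We have to sanitize the CSV line
 			char_array.push_back('\0'); // Null-terminate the character array
 			Utf8Proc::MakeValid(&char_array[0], char_array.size());
 			borked_line = {char_array.begin(), char_array.end() - 1};
-			csv_error =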
CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl), - line_pos.GetGlobalPosition(requested_size, first_nl)); + if (current_line_position.begin == line_pos) { + csv_error = + CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + line_pos.GetGlobalPosition(requested_size, first_nl)); + } else { + csv_error = + CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + line_pos.GetGlobalPosition(requested_size)); + } break; } case CSVErrorType::UNTERMINATED_QUOTES: - csv_error = CSVError::UnterminatedQuotesError( - state_machine.options, col_idx, lines_per_batch, borked_line, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl), - line_pos.GetGlobalPosition(requested_size, first_nl)); + if (current_line_position.begin == line_pos) { + csv_error = CSVError::UnterminatedQuotesError( + state_machine.options, col_idx, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + line_pos.GetGlobalPosition(requested_size, first_nl)); + } else { + csv_error = CSVError::UnterminatedQuotesError( + state_machine.options, col_idx, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + line_pos.GetGlobalPosition(requested_size)); + } break; case CSVErrorType::CAST_ERROR: - csv_error = CSVError::CastError( - state_machine.options, names[cur_error.col_idx], cur_error.error_message, cur_error.col_idx, - borked_line, lines_per_batch, current_line_position.begin.GetGlobalPosition(requested_size, first_nl), - line_pos.GetGlobalPosition(requested_size, first_nl), parse_types[cur_error.col_idx].first); + if (current_line_position.begin == line_pos) { + csv_error = CSVError::CastError( + state_machine.options, names[cur_error.col_idx], cur_error.error_message, cur_error.col_idx, + borked_line, lines_per_batch, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + line_pos.GetGlobalPosition(requested_size, first_nl), parse_types[cur_error.col_idx].first); + } else { + csv_error = CSVError::CastError( + state_machine.options, names[cur_error.col_idx], cur_error.error_message, cur_error.col_idx, + borked_line, lines_per_batch, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + line_pos.GetGlobalPosition(requested_size), parse_types[cur_error.col_idx].first); + } + break; default: throw InvalidInputException("CSV Error not allowed when inserting row"); @@ -499,11 +538,20 @@ bool StringValueResult::AddRowInternal() { bool first_nl; auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); - auto csv_error = CSVError::IncorrectColumnAmountError( - state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl), - last_position.GetGlobalPosition(requested_size, first_nl)); - error_handler.Error(csv_error); + if (current_line_position.begin == last_position) { + auto csv_error = CSVError::IncorrectColumnAmountError( + state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + 
last_position.GetGlobalPosition(requested_size, first_nl)); + error_handler.Error(csv_error); + } else { + auto csv_error = CSVError::IncorrectColumnAmountError( + state_machine.options, cur_col_id - 1, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl), + last_position.GetGlobalPosition(requested_size)); + error_handler.Error(csv_error); + } + // If we are here we ignore_errors, so we delete this line number_of_rows--; } @@ -1034,16 +1082,14 @@ bool StringValueScanner::MoveToNextBuffer() { // And an extra empty value to represent what comes after the delimiter result.AddRow(result, previous_buffer_handle->actual_size); lines_read++; - } - else if (states.IsQuotedCurrent()) { + } else if (states.IsQuotedCurrent()) { // Unterminated quote LinePosition current_line_start = {iterator.pos.buffer_idx, iterator.pos.buffer_pos, result.buffer_size}; result.current_line_position.begin = result.current_line_position.end; result.current_line_position.end = current_line_start; result.InvalidState(result); - } - else { + } else { result.AddRow(result, previous_buffer_handle->actual_size); lines_read++; } diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 3ad83d0954f7..b16c2d48df79 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -279,7 +279,7 @@ void CSVGlobalState::FillRejectsTable() { // a null errors_appender.Append(Value()); } else { - errors_appender.Append(error.byte_position); + errors_appender.Append(error.byte_position + 1); } // 6. Column Index errors_appender.Append(col_idx + 1); diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index d9119a4dd9d3..18a37fa3f2ff 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -89,7 +89,7 @@ CSVError::CSVError(string error_message_p, CSVErrorType type_p, idx_t column_idx error_info(error_info_p), row_byte_position(row_byte_position), byte_position(byte_position_p) { // What were the options std::ostringstream error; - if (reader_options.ignore_errors.GetValue()){ + if (reader_options.ignore_errors.GetValue()) { RemoveNewLine(error_message); } error << error_message << '\n'; @@ -119,7 +119,7 @@ CSVError CSVError::ColumnTypesError(case_insensitive_map_t sql_types_per_ return CSVError(exception, CSVErrorType::COLUMN_NAME_TYPE_MISMATCH, {}); } -void CSVError::RemoveNewLine(string &error){ +void CSVError::RemoveNewLine(string &error) { error = StringUtil::Split(error, "\n")[0]; } diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 6332906f03ca..0039f9ade5b0 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -42,7 +42,12 @@ class LinePosition { } return other.buffer_size - other.buffer_pos + buffer_pos; } - idx_t GetGlobalPosition(idx_t requested_buffer_size, bool first_char_nl) { + + bool operator==(const LinePosition &other) const { + return buffer_pos == other.buffer_pos && buffer_idx == other.buffer_idx && buffer_size == other.buffer_size; + } + + idx_t GetGlobalPosition(idx_t requested_buffer_size, 
bool first_char_nl = false) { return requested_buffer_size * buffer_idx + buffer_pos + first_char_nl; } idx_t buffer_pos = 0; diff --git a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test index 76b95cfbe731..35f44da755a0 100644 --- a/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test +++ b/test/sql/copy/csv/rejects/csv_buffer_size_rejects.test @@ -7,7 +7,7 @@ require skip_reload # Test will fail on windows because byte_position is slightly different due to \r\n instead of \n require notwindows -loop buffer_size 5 10 +loop buffer_size 5 8 # Ensure that we can get the schema if we reduce the sample size and ignore errors query IIIII @@ -15,45 +15,29 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', sample_size=1, buffer_size=${buffer_size}, - rejects_table='csv_rejects_table', - ignore_errors=true); + store_rejects = true); ---- BIGINT VARCHAR 11044 11044 2 -query IIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIIII rowsort +SELECT * EXCLUDE (scan_id, user_arguments) FROM reject_scans order by all; ---- -test/sql/copy/csv/data/error/mismatch/big_bad.csv 1 "column0" CAST B, A 10875 -test/sql/copy/csv/data/error/mismatch/big_bad.csv 1 "column0" CAST C, A 20875 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 1 "column0" CAST B, A 18395 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 1 "column0" CAST C, A 28395 - -query I -SELECT error_message -FROM csv_rejects_table where byte_position = 10875; ----- -:.*Could not convert string "B" to 'BIGINT'.* +0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL +1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL -query I -SELECT error_message -FROM csv_rejects_table where byte_position = 20875; ----- -:.*Could not convert string "C" to 'BIGINT'.* -query I -SELECT error_message -FROM csv_rejects_table where byte_position = 18395; +query IIIIIIIII rowsort +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -:.*Could not convert string "B" to 'BIGINT'.* +0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT'

statement ok
DROP TABLE reject_errors;

statement ok
DROP TABLE reject_scans;

endloop
\ No newline at end of file
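Commit 064 above keys the byte-position reporting off LinePosition equality and a now-defaulted first_char_nl flag. The position arithmetic, lifted from the header diff into a self-contained sketch (field names follow the diff):

    #include <cstdint>

    using idx_t = uint64_t;

    // Sketch of LinePosition: a position inside one scan buffer,
    // convertible to a global byte offset in the file.
    struct LinePosition {
        idx_t buffer_pos = 0;
        idx_t buffer_idx = 0;
        idx_t buffer_size = 0;

        bool operator==(const LinePosition &other) const {
            return buffer_pos == other.buffer_pos && buffer_idx == other.buffer_idx &&
                   buffer_size == other.buffer_size;
        }

        // Buffers are requested at a fixed size, so the global offset is
        // buffer index * requested size + offset within the buffer, plus one
        // if the line starts with the newline of the previous line.
        idx_t GetGlobalPosition(idx_t requested_buffer_size, bool first_char_nl = false) const {
            return requested_buffer_size * buffer_idx + buffer_pos + first_char_nl;
        }
    };

From eca3caf1404d6aa75d8e3beff9ed212594373646 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 3 Apr 2024 15:42:30 +0200
Subject: [PATCH 065/147] Incorrect Column Amount

---
 .../scanner/string_value_scanner.cpp          |   1 +
 .../operator/csv_scanner/util/csv_error.cpp   |   4 +-
 .../csv_incorrect_columns_amount_rejects.test | 125 ++++++++++--------
 3 files changed, 71 insertions(+), 59 deletions(-)

diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index 4a00f957e300..d3e980a57392 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -121,6 +121,7 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size
 			// We error pointing to the current value error.
 			current_errors.push_back({CSVErrorType::TOO_MANY_COLUMNS, cur_col_id, last_position});
 		}
+		cur_col_id++;
 		return;
 	}

diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp
index 18a37fa3f2ff..da9f5d5e2435 100644
--- a/src/execution/operator/csv_scanner/util/csv_error.cpp
+++ b/src/execution/operator/csv_scanner/util/csv_error.cpp
@@ -214,10 +214,10 @@ CSVError CSVError::IncorrectColumnAmountError(const CSVReaderOptions &options, i
 	error << "Expected Number of Columns: " << options.dialect_options.num_cols << " Found: " << actual_columns + 1;
 	if (actual_columns >= options.dialect_options.num_cols) {
 		return CSVError(error.str(), CSVErrorType::TOO_MANY_COLUMNS, actual_columns, csv_row, error_info,
-		                row_byte_position, byte_position, options, how_to_fix_it.str());
+		                row_byte_position, byte_position - 1, options, how_to_fix_it.str());
 	} else {
 		return CSVError(error.str(), CSVErrorType::TOO_FEW_COLUMNS, actual_columns, csv_row, error_info,
-		                row_byte_position, byte_position, options, how_to_fix_it.str());
+		                row_byte_position, byte_position - 1, options, how_to_fix_it.str());
 	}
 }

diff --git a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
index 070b413a8497..2b59e17547d3 100644
--- a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
+++ b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test
@@ -11,104 +11,115 @@ statement ok
 SELECT * FROM read_csv(
     'data/csv/rejects/incorrect_columns/few_columns.csv',
     columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'},
-    rejects_table='csv_rejects_table',
-    ignore_errors=true, auto_detect=false, header = 1);
+    store_rejects=true, auto_detect=false, header = 1);

-query IIIIIII rowsort
-SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
-FROM csv_rejects_table;
+query IIIIIIIIII rowsort
+FROM reject_errors order by all;
 ----
-data/csv/rejects/incorrect_columns/few_columns.csv 1814 3 "d" MISSING COLUMNS 1,2,3 14504
-data/csv/rejects/incorrect_columns/few_columns.csv 1823 1 "b" MISSING COLUMNS 1 14574
-data/csv/rejects/incorrect_columns/few_columns.csv 2378 1 "b" MISSING COLUMNS 1 19008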
-data/csv/rejects/incorrect_columns/few_columns.csv 2762 2 "c" MISSING COLUMNS 1,2 22074 +3 0 1814 14505 14510 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 +3 0 1823 14575 14576 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +3 0 2378 19009 19010 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +3 0 2762 22075 22078 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; statement ok SELECT * FROM read_csv( 'data/csv/rejects/incorrect_columns/many_columns.csv', columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1); + store_rejects=true, auto_detect=false, header = 1); -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII rowsort +FROM reject_errors order by all; ---- -data/csv/rejects/incorrect_columns/many_columns.csv 1096 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 8760 -data/csv/rejects/incorrect_columns/many_columns.csv 1159 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 9268 -data/csv/rejects/incorrect_columns/many_columns.csv 1206 5 NULL TOO MANY COLUMNS 1,2,3,4,5 9648 -data/csv/rejects/incorrect_columns/many_columns.csv 2769 5 NULL TOO MANY COLUMNS 1,2,3,4,5 22154 +7 0 1096 8761 8768 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +7 0 1096 8761 8770 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +7 0 1159 9269 9276 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +7 0 1159 9269 9278 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +7 0 1206 9649 9656 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 +7 0 2769 22155 22162 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; statement ok SELECT * FROM read_csv( 'data/csv/rejects/incorrect_columns/mix_columns.csv', columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1); + store_rejects=true, auto_detect=false, header = 1); -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII rowsort +FROM reject_errors order by all; ---- -data/csv/rejects/incorrect_columns/mix_columns.csv 1604 1 "b" MISSING COLUMNS 1 12824 -data/csv/rejects/incorrect_columns/mix_columns.csv 1671 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 13354 -data/csv/rejects/incorrect_columns/mix_columns.csv 2751 2 "c" MISSING COLUMNS 1,2 21998 -data/csv/rejects/incorrect_columns/mix_columns.csv 2768 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 22130 +11 0 1604 12825 12826 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +11 0 1671 13355 13362 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +11 0 1671 13355 13364 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +11 0 2751 21999 22002 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 +11 0 2768 22131 22138 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +11 0 2768 22131 22140 6 
NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 + # Different Buffer Sizes loop buffer_size 10 15 statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; statement ok SELECT * FROM read_csv( 'data/csv/rejects/incorrect_columns/small_mix.csv', columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1); + store_rejects=true, auto_detect=false, header = 1); -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIII rowsort +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all ---- -data/csv/rejects/incorrect_columns/small_mix.csv 3 5 NULL TOO MANY COLUMNS 1,2,3,4,5 16 -data/csv/rejects/incorrect_columns/small_mix.csv 4 3 "d" MISSING COLUMNS 1,2,3 26 +0 3 17 24 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 +0 4 27 32 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 endloop # All files statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; statement ok SELECT * FROM read_csv( 'data/csv/rejects/incorrect_columns/*.csv', columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1); + store_rejects=true, auto_detect=false, header = 1); -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII rowsort +FROM reject_errors order by all ---- -data/csv/rejects/incorrect_columns/few_columns.csv 1814 3 "d" MISSING COLUMNS 1,2,3 14504 -data/csv/rejects/incorrect_columns/few_columns.csv 1823 1 "b" MISSING COLUMNS 1 14574 -data/csv/rejects/incorrect_columns/few_columns.csv 2378 1 "b" MISSING COLUMNS 1 19008 -data/csv/rejects/incorrect_columns/few_columns.csv 2762 2 "c" MISSING COLUMNS 1,2 22074 -data/csv/rejects/incorrect_columns/many_columns.csv 1096 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 8760 -data/csv/rejects/incorrect_columns/many_columns.csv 1159 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 9268 -data/csv/rejects/incorrect_columns/many_columns.csv 1206 5 NULL TOO MANY COLUMNS 1,2,3,4,5 9648 -data/csv/rejects/incorrect_columns/many_columns.csv 2769 5 NULL TOO MANY COLUMNS 1,2,3,4,5 22154 -data/csv/rejects/incorrect_columns/mix_columns.csv 1604 1 "b" MISSING COLUMNS 1 12824 -data/csv/rejects/incorrect_columns/mix_columns.csv 1671 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 13354 -data/csv/rejects/incorrect_columns/mix_columns.csv 2751 2 "c" MISSING COLUMNS 1,2 21998 -data/csv/rejects/incorrect_columns/mix_columns.csv 2768 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 22130 -data/csv/rejects/incorrect_columns/small_mix.csv 3 5 NULL TOO MANY COLUMNS 1,2,3,4,5 16 -data/csv/rejects/incorrect_columns/small_mix.csv 4 3 "d" MISSING COLUMNS 1,2,3 26 +35 0 1814 14505 14510 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 +35 0 1823 14575 14576 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +35 0 2378 19009 19010 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +35 0 2762 22075 22078 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 +35 1 1096 8761 8768 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +35 1 1096 8761 8770 6 NULL TOO 
MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +35 1 1159 9269 9276 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +35 1 1159 9269 9278 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +35 1 1206 9649 9656 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 +35 1 2769 22155 22162 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 +35 2 1604 12825 12826 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +35 2 1671 13355 13362 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +35 2 1671 13355 13364 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +35 2 2751 21999 22002 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 +35 2 2768 22131 22138 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +35 2 2768 22131 22140 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +35 3 3 17 24 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 +35 3 4 27 32 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 From 50d658c8cde7b5208a1b56feedd0ff3d9a4759dd Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 3 Apr 2024 16:43:43 +0200 Subject: [PATCH 066/147] Don't really care about copy from yet --- .../copy/csv/rejects/csv_rejects_auto.test | 123 +++--------------- 1 file changed, 15 insertions(+), 108 deletions(-) diff --git a/test/sql/copy/csv/rejects/csv_rejects_auto.test b/test/sql/copy/csv/rejects/csv_rejects_auto.test index bfa8073a6567..e673e9917287 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_auto.test +++ b/test/sql/copy/csv/rejects/csv_rejects_auto.test @@ -11,89 +11,44 @@ query IIIII SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', sample_size=1, - rejects_table='csv_rejects_table', - ignore_errors=true); + store_rejects=true); ---- BIGINT VARCHAR 11044 11044 2 -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII rowsort +FROM reject_errors order by all; ---- -test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 1 "column0" CAST B, A 10875 -test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 1 "column0" CAST C, A 20875 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 1 "column0" CAST B, A 18395 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 1 "column0" CAST C, A 28395 +3 0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +3 1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT' -query I -SELECT error_message -FROM csv_rejects_table where line=2176 and column_idx=1; ----- -:.*Could not convert string "B" to 'BIGINT'.* - -query I -SELECT error_message -FROM csv_rejects_table where line=4176 and column_idx=1; ----- -:.*Could not convert string "C" to 'BIGINT'.* - -query I -SELECT error_message -FROM csv_rejects_table where line=3680 and column_idx=1; ----- -:.*Could not convert string "B" to 'BIGINT'.* - -query I -SELECT error_message -FROM csv_rejects_table where line=5680 and column_idx=1; ----- -:.*Could not convert string "C" to 'BIGINT'.* +statement ok +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_scans; # Test with lots of errors query I SELECT SUM(num) FROM read_csv_auto( 'test/sql/copy/csv/data/error/mismatch/half1.csv', header=true, - ignore_errors=true, sample_size=1, - rejects_table='csv_rejects_table') ----- -2464 - -query I -SELECT COUNT(*) FROM csv_rejects_table; ----- -1024 - -statement ok -DROP TABLE csv_rejects_table; - -# Test same with COPY -statement ok -CREATE TABLE tbl1 (col1 BIGINT, col2 VARCHAR); - -statement ok -COPY tbl1 FROM 'test/sql/copy/csv/data/error/mismatch/half1.csv' -WITH (HEADER, IGNORE_ERRORS TRUE, SAMPLE_SIZE 1000, REJECTS_TABLE 'csv_rejects_table'); - -query I -SELECT SUM(col1) FROM tbl1; + store_rejects=true) ---- 2464 query I -SELECT COUNT(*) FROM csv_rejects_table; +SELECT COUNT(*) FROM reject_errors; ---- 1024 statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; statement ok -DROP TABLE tbl1; +DROP TABLE reject_scans; # Test with more errors than STANDARD_VECTOR_SIZE query I @@ -112,52 +67,4 @@ SELECT COUNT(*) FROM csv_rejects_table; 3072 statement ok -DROP TABLE csv_rejects_table; - -statement ok -CREATE TABLE tbl1 (col1 BIGINT, col2 VARCHAR); - -statement ok -COPY tbl1 FROM 'test/sql/copy/csv/data/error/mismatch/half2.csv' -WITH (HEADER, IGNORE_ERRORS TRUE, SAMPLE_SIZE 1000, REJECTS_TABLE 'csv_rejects_table'); - -query I -SELECT SUM(col1) FROM tbl1; ----- -2542 - -query I -SELECT COUNT(*) FROM csv_rejects_table; ----- -3072 - -statement ok -DROP TABLE csv_rejects_table; - -statement ok -DROP TABLE tbl1; - -# Test with more errors than STANDARD_VECTOR_SIZE and limit -statement ok -CREATE TABLE tbl1 (col1 BIGINT, col2 VARCHAR); - -statement ok -COPY tbl1 FROM 'test/sql/copy/csv/data/error/mismatch/half2.csv' -WITH (HEADER, IGNORE_ERRORS TRUE, SAMPLE_SIZE 1000, REJECTS_TABLE 'csv_rejects_table', REJECTS_LIMIT 1337); - -query I -SELECT SUM(col1) FROM tbl1; ----- -2542 - -query I -SELECT COUNT(*) FROM csv_rejects_table; ----- -1337 - -statement ok -DROP TABLE csv_rejects_table; - -statement ok -DROP TABLE tbl1; - +DROP TABLE csv_rejects_table; \ No newline at end of file From 8b97a738ab147e76ea5d927f583b46d684019ddc Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Wed, 3 Apr 2024 16:55:51 +0200 Subject: [PATCH 067/147] More test fixes --- .../csv/rejects/csv_rejects_flush_cast.test | 1 - .../csv/rejects/csv_rejects_maximum_line.test | 71 ++++++++++--------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test index 7af3c29f373d..e6459aa5cd77 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test +++ b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test @@ -18,7 +18,6 @@ SELECT typeof(first(a)), typeof(first(b)), COUNT(*) FROM read_csv( DATE VARCHAR 2811 query IIIIIIIIII -SELECT * FROM 
reject_errors order by all; ---- 3 0 439 6997 NULL 1 a CAST B, bla Error when converting column "a". Could not parse string "B" according to format specifier "%d-%m-%Y" diff --git a/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test index f6214aab0906..1095a90d70f8 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test +++ b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test @@ -11,17 +11,18 @@ statement ok SELECT * FROM read_csv( 'data/csv/rejects/maximum_line/max_10.csv', columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1, max_line_size=10); + store_rejects=true, auto_detect=false, header = 1, max_line_size=10); -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII +FROM reject_errors order by all; ---- -data/csv/rejects/maximum_line/max_10.csv 5 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 +3 0 5 23 23 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; # Test with buffer sizes @@ -31,17 +32,18 @@ statement ok SELECT * FROM read_csv( 'data/csv/rejects/maximum_line/max_10.csv', columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1, max_line_size=10, buffer_size=${buffer_size}); + store_rejects = true, auto_detect=false, header = 1, max_line_size=10, buffer_size=${buffer_size}); -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -data/csv/rejects/maximum_line/max_10.csv 5 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 +0 5 23 23 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; endloop @@ -50,37 +52,38 @@ statement ok SELECT * FROM read_csv( 'data/csv/rejects/maximum_line/over_vector.csv', columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1, max_line_size=20); + store_rejects = true, auto_detect=false, header = 1, max_line_size=20); -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII +FROM reject_errors order by all; ---- -data/csv/rejects/maximum_line/over_vector.csv 2282 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 13684 -data/csv/rejects/maximum_line/over_vector.csv 2591 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 15557 -data/csv/rejects/maximum_line/over_vector.csv 2923 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 17568 +27 0 2282 13685 13685 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. +27 0 2591 15558 15558 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. 
+27 0 2923 17569 17569 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; -# Read Multiple Files +statement ok +DROP TABLE reject_scans; +# Read Multiple Files statement ok SELECT * FROM read_csv( 'data/csv/rejects/maximum_line/*.csv', columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1, max_line_size=10); + store_rejects = true, auto_detect=false, header = 1, max_line_size=10); -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII +FROM reject_errors order by all; ---- -data/csv/rejects/maximum_line/max_10.csv 5 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 22 -data/csv/rejects/maximum_line/over_vector.csv 2282 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 13684 -data/csv/rejects/maximum_line/over_vector.csv 2591 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 15557 -data/csv/rejects/maximum_line/over_vector.csv 2923 1 "a" LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 17568 +31 0 5 23 23 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. +31 1 2282 13685 13685 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. +31 1 2591 15558 15558 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. +31 1 2923 17569 17569 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. + +statement ok +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table; \ No newline at end of file +DROP TABLE reject_scans; \ No newline at end of file From 3db603c364c44af68fcdcfe97f60d1544072ea4c Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 4 Apr 2024 12:08:05 +0200 Subject: [PATCH 068/147] Fixing rejects_read to new model, properly increment scan_id --- .../scanner/string_value_scanner.cpp | 3 +- .../table_function/global_csv_state.cpp | 3 +- .../transaction/transaction_context.hpp | 2 + src/transaction/transaction_context.cpp | 6 + .../copy/csv/rejects/csv_rejects_read.test | 268 ++++++------------ 5 files changed, 89 insertions(+), 193 deletions(-) diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index d3e980a57392..dd7c4364fb9d 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -120,8 +120,8 @@ void StringValueResult::AddValueToVector(const char *value_ptr, const idx_t size if (error) { // We error pointing to the current value error. 
 			current_errors.push_back({CSVErrorType::TOO_MANY_COLUMNS, cur_col_id, last_position});
+			cur_col_id++;
 		}
-		cur_col_id++;
 		return;
 	}
@@ -552,7 +552,6 @@ bool StringValueResult::AddRowInternal() {
 			    last_position.GetGlobalPosition(requested_size));
 			error_handler.Error(csv_error);
 		}
-
 		// If we are here we ignore_errors, so we delete this line
 		number_of_rows--;
 	}

diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
index b16c2d48df79..6646f4ab98b8 100644
--- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
+++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp
@@ -245,8 +245,8 @@ void CSVGlobalState::FillRejectsTable() {
 	InternalAppender errors_appender(context, errors_table);
 	InternalAppender scans_appender(context, scans_table);
 	idx_t scan_idx = context.transaction.GetActiveQuery();
-	idx_t file_idx = 0;
 	for (auto &file : file_scans) {
+		idx_t file_idx = context.transaction.GetIncrementalIndex();
 		auto file_name = file->file_path;
 		auto &errors = file->error_handler->errors;
 		// We first insert the file into the file scans table
@@ -309,7 +309,6 @@ void CSVGlobalState::FillRejectsTable() {
 			rejects->count = 0;
 			FillScanErrorTable(scans_appender, scan_idx, file_idx, *file);
 		}
-		file_idx++;
 	}
 	errors_appender.Close();
 	scans_appender.Close();

diff --git a/src/include/duckdb/transaction/transaction_context.hpp b/src/include/duckdb/transaction/transaction_context.hpp
index b0a50103bb46..b265c0131498 100644
--- a/src/include/duckdb/transaction/transaction_context.hpp
+++ b/src/include/duckdb/transaction/transaction_context.hpp
@@ -48,6 +48,7 @@ class TransactionContext {
 	}

 	idx_t GetActiveQuery();
+	idx_t GetIncrementalIndex();
 	void ResetActiveQuery();
 	void SetActiveQuery(transaction_t query_number);

@@ -56,6 +57,7 @@
 	bool auto_commit;
 	unique_ptr<MetaTransaction> current_transaction;
+	idx_t incremental_index = 0;

 	TransactionContext(const TransactionContext &) = delete;
 };

diff --git a/src/transaction/transaction_context.cpp b/src/transaction/transaction_context.cpp
index 7185a263894b..82d1fa43094f 100644
--- a/src/transaction/transaction_context.cpp
+++ b/src/transaction/transaction_context.cpp
@@ -89,13 +89,19 @@ idx_t TransactionContext::GetActiveQuery() {
 	return current_transaction->GetActiveQuery();
 }

+idx_t TransactionContext::GetIncrementalIndex() {
+	return incremental_index++;
+}
+
 void TransactionContext::ResetActiveQuery() {
+	incremental_index = 0;
 	if (current_transaction) {
 		SetActiveQuery(MAXIMUM_QUERY_ID);
 	}
 }

 void TransactionContext::SetActiveQuery(transaction_t query_number) {
+	incremental_index = 0;
 	if (!current_transaction) {
 		throw InternalException("SetActiveQuery called without active transaction");
 	}
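The transaction change above replaces a manually bumped file_idx with a per-query counter that resets on every new query, so the files of concurrent scans within one query get stable indexes 0, 1, 2, ... A reduced sketch of that counter, with the surrounding TransactionContext plumbing elided:

    #include <cstdint>

    using idx_t = uint64_t;

    // Sketch: per-query incremental index handed out to CSV file scans so
    // that the reject tables can label files within a single query.
    class IncrementalIndex {
    public:
        // Mirrors GetIncrementalIndex() in the diff: post-increment hand-out.
        idx_t Next() {
            return incremental_index++;
        }
        // Mirrors the resets in ResetActiveQuery()/SetActiveQuery().
        void Reset() {
            incremental_index = 0;
        }

    private:
        idx_t incremental_index = 0;
    };

diff --git a/test/sql/copy/csv/rejects/csv_rejects_read.test b/test/sql/copy/csv/rejects/csv_rejects_read.test
index 9917965558ba..b537833fd7dd 100644
--- a/test/sql/copy/csv/rejects/csv_rejects_read.test
+++ b/test/sql/copy/csv/rejects/csv_rejects_read.test
@@ -10,101 +10,67 @@ query III rowsort
 SELECT * FROM read_csv(
     'test/sql/copy/csv/data/error/mismatch/bad.csv',
     columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'},
-    rejects_table='csv_rejects_table',
-    ignore_errors=true, auto_detect=false);
+    store_rejects = true, auto_detect=true);
 ----
 1 2 AAA
 6 7 CCC

-query IIIIIII rowsort
-SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position
-FROM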
csv_rejects_table; +query IIIIIIIIII +FROM reject_errors; ---- -test/sql/copy/csv/data/error/mismatch/bad.csv 2 2 "col1" CAST 4,BBB,9, 9 +3 0 2 10 12 2 col1 CAST 4,BBB,9, Error when converting column "col1". Could not convert string "BBB" to 'INTEGER' -query I -SELECT error_message -FROM csv_rejects_table; ----- -:.*Could not convert string "BBB" to 'INTEGER'.* +statement ok +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_scans; # Test with multiple columns on the same row query III rowsort SELECT * FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/bad2.csv', columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'INTEGER'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false); + store_rejects = true, auto_detect=false); ---- 4 5 9 -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; ----- -test/sql/copy/csv/data/error/mismatch/bad2.csv 1 3 "col2" CAST 1,2,DDD, 0 -test/sql/copy/csv/data/error/mismatch/bad2.csv 3 1 "col0" CAST EEE,7,FFF, 16 -test/sql/copy/csv/data/error/mismatch/bad2.csv 3 3 "col2" CAST EEE,7,FFF, 16 - -query I -SELECT error_message -FROM csv_rejects_table where line=1 and column_idx=3; +query IIIIIIIIII +FROM reject_errors ORDER BY ALL; ---- -:.*Could not convert string "DDD" to 'INTEGER'.* +7 0 1 1 5 3 col2 CAST 1,2,DDD, Error when converting column "col2". Could not convert string "DDD" to 'INTEGER' +7 0 3 17 17 1 col0 CAST EEE,7,FFF, Error when converting column "col0". Could not convert string "EEE" to 'INTEGER' +7 0 3 17 23 3 col2 CAST EEE,7,FFF, Error when converting column "col2". Could not convert string "FFF" to 'INTEGER' -query I -SELECT error_message -FROM csv_rejects_table where line=3 and column_idx=1; ----- -:.*Could not convert string "EEE" to 'INTEGER'.* - -query I -SELECT error_message -FROM csv_rejects_table where line=3 and column_idx=3; ----- -:.*Could not convert string "FFF" to 'INTEGER'.* +statement ok +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_scans; # Test with multiple files query III rowsort SELECT * FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/bad*.csv', columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false); + store_rejects = true, auto_detect=false); ---- 1 2 AAA 1 2 DDD 4 5 9 6 7 CCC - -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII +FROM reject_errors ORDER BY ALL; ---- -test/sql/copy/csv/data/error/mismatch/bad.csv 2 2 "col1" CAST 4,BBB,9, 9 -test/sql/copy/csv/data/error/mismatch/bad2.csv 3 1 "col0" CAST EEE,7,FFF, 16 +11 0 2 10 12 2 col1 CAST 4,BBB,9, Error when converting column "col1". Could not convert string "BBB" to 'INTEGER' +11 1 3 17 17 1 col0 CAST EEE,7,FFF, Error when converting column "col0". 
Could not convert string "EEE" to 'INTEGER' -query I -SELECT error_message -FROM csv_rejects_table where line=2 and column_idx=2; ----- -:.*Could not convert string "BBB" to 'INTEGER'.* - -query I -SELECT error_message -FROM csv_rejects_table where line=3 and column_idx=1; ----- -:.*Could not convert string "EEE" to 'INTEGER'.* +statement ok +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_scans; # Set limit @@ -112,9 +78,7 @@ query III rowsort SELECT * FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/bad*.csv', columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, - rejects_table='csv_rejects_table', - rejects_limit=2, - ignore_errors=true, auto_detect=false); + store_rejects = true,rejects_limit=2, ignore_errors=true, auto_detect=false); ---- 1 2 AAA 1 2 DDD @@ -123,121 +87,80 @@ SELECT * FROM read_csv( # We should now only have two errors logged query I -SELECT COUNT(*) FROM csv_rejects_table +SELECT COUNT(*) FROM reject_errors ---- 2 statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; # Try with bigger files query I SELECT SUM(num) FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/big_bad.csv', columns = {'num': 'INTEGER', 'str': 'VARCHAR'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false); + store_rejects = true, auto_detect=false); ---- 4270 -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; ----- -test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 1 "num" CAST B, A 10875 -test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 1 "num" CAST C, A 20875 - -query I -SELECT error_message -FROM csv_rejects_table where line=2176 and column_idx=1; +query IIIIIIIIII +FROM reject_errors ORDER BY ALL; ---- -:.*Could not convert string "B" to 'INTEGER'.* +19 0 2176 10876 10876 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' +19 0 4176 20876 20876 1 num CAST C, A Error when converting column "num". Could not convert string "C" to 'INTEGER' -query I -SELECT error_message -FROM csv_rejects_table where line=4176 and column_idx=1; ----- -:.*Could not convert string "C" to 'INTEGER'.* +statement ok +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_scans; query I SELECT SUM(num) FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/big_bad2.csv', columns = {'num': 'INTEGER', 'str': 'VARCHAR'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false) + store_rejects = true, auto_detect=false) ---- 6774 -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII +FROM reject_errors ORDER BY ALL; ---- -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 1 "num" CAST B, A 18395 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 1 "num" CAST C, A 28395 +23 0 3680 18396 18396 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' +23 0 5680 28396 28396 1 num CAST C, A Error when converting column "num". 
Could not convert string "C" to 'INTEGER' -query I -SELECT error_message -FROM csv_rejects_table where line=3680 and column_idx=1; ----- -:.*Could not convert string "B" to 'INTEGER'.* -query I -SELECT error_message -FROM csv_rejects_table where line=5680 and column_idx=1; ----- -:.*Could not convert string "C" to 'INTEGER'.* +statement ok +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_scans; # Test with multiple big files query I SELECT SUM(num) FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/big_*.csv', columns = {'num': 'INTEGER', 'str': 'VARCHAR'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false); + store_rejects = true, auto_detect=false); ---- 11044 -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; ----- -test/sql/copy/csv/data/error/mismatch/big_bad.csv 2176 1 "num" CAST B, A 10875 -test/sql/copy/csv/data/error/mismatch/big_bad.csv 4176 1 "num" CAST C, A 20875 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 3680 1 "num" CAST B, A 18395 -test/sql/copy/csv/data/error/mismatch/big_bad2.csv 5680 1 "num" CAST C, A 28395 - -query I -SELECT error_message -FROM csv_rejects_table where line=3680 and column_idx=1; ----- -:.*Could not convert string "B" to 'INTEGER'.* - -query I -SELECT error_message -FROM csv_rejects_table where line=5680 and column_idx=1; +query IIIIIIIIII +FROM reject_errors ORDER BY ALL; ---- -:.*Could not convert string "C" to 'INTEGER'.* +27 0 2176 10876 10876 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' +27 0 4176 20876 20876 1 num CAST C, A Error when converting column "num". Could not convert string "C" to 'INTEGER' +27 1 3680 18396 18396 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' +27 1 5680 28396 28396 1 num CAST C, A Error when converting column "num". 
Could not convert string "C" to 'INTEGER' -query I -SELECT error_message -FROM csv_rejects_table where line=2176 and column_idx=1; ----- -:.*Could not convert string "B" to 'INTEGER'.* - -query I -SELECT error_message -FROM csv_rejects_table where line=4176 and column_idx=1; ----- -:.*Could not convert string "C" to 'INTEGER'.* +statement ok +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table; +DROP TABLE reject_scans; # Test with multiple rejects table in the same query query IIII rowsort @@ -245,61 +168,36 @@ SELECT * FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/small1.csv', columns = {'num': 'INTEGER', 'str': 'VARCHAR'}, - rejects_table='csv_rejects_table_left', - ignore_errors=true) as L + store_rejects = true) as L JOIN read_csv( 'test/sql/copy/csv/data/error/mismatch/small2.csv', columns = {'num': 'INTEGER', 'str': 'VARCHAR'}, - rejects_table='csv_rejects_table_right', - ignore_errors=true) as R + store_rejects = true) as R ON L.num = R.num; ---- 1 A 1 A 3 C 3 C -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table_left; ----- -test/sql/copy/csv/data/error/mismatch/small1.csv 3 1 "num" CAST X,Y 14 -test/sql/copy/csv/data/error/mismatch/small1.csv 6 1 "num" CAST X,Y 26 - -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table_right; ----- -test/sql/copy/csv/data/error/mismatch/small2.csv 3 1 "num" CAST X,Y 14 -test/sql/copy/csv/data/error/mismatch/small2.csv 5 1 "num" CAST X,Y 22 - -query I -SELECT error_message -FROM csv_rejects_table_left where line=3 and column_idx=1; ----- -:.*Could not convert string "X" to 'INTEGER'.* - -query I -SELECT error_message -FROM csv_rejects_table_left where line=6 and column_idx=1; +query IIIIIIIIIIIII +FROM reject_scans ORDER BY ALL; ---- -:.*Could not convert string "X" to 'INTEGER'.* +31 0 test/sql/copy/csv/data/error/mismatch/small1.csv , " " \n 0 true {'num': 'INTEGER','str': 'VARCHAR'} NULL NULL store_rejects=true +31 1 test/sql/copy/csv/data/error/mismatch/small2.csv , " " \n 0 true {'num': 'INTEGER','str': 'VARCHAR'} NULL NULL store_rejects=true -query I -SELECT error_message -FROM csv_rejects_table_right where line=3 and column_idx=1; ----- -:.*Could not convert string "X" to 'INTEGER'.* -query I -SELECT error_message -FROM csv_rejects_table_right where line=5 and column_idx=1; +query IIIIIIIIII +FROM reject_errors ORDER BY ALL; ---- -:.*Could not convert string "X" to 'INTEGER'.* +31 0 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +31 0 6 27 27 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +31 1 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +31 1 5 23 23 1 num CAST X,Y Error when converting column "num". 
Could not convert string "X" to 'INTEGER' statement ok -DROP TABLE csv_rejects_table_left; +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table_right; +DROP TABLE reject_scans; # Test with multiple rejects table in the same query, with different limits # (only one reject should be logged in right table) @@ -308,36 +206,28 @@ SELECT * FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/small1.csv', columns = {'num': 'INTEGER', 'str': 'VARCHAR'}, - rejects_table='csv_rejects_table_left', - ignore_errors=true) as L + store_rejects = true) as L JOIN read_csv( 'test/sql/copy/csv/data/error/mismatch/small2.csv', columns = {'num': 'INTEGER', 'str': 'VARCHAR'}, - rejects_table='csv_rejects_table_right', - rejects_limit=1, - ignore_errors=true) as R + store_rejects = true, rejects_limit=1) as R ON L.num = R.num; ---- 1 A 1 A 3 C 3 C -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table_left; +query IIIIIIIIII +FROM reject_errors ORDER BY ALL; ---- -test/sql/copy/csv/data/error/mismatch/small1.csv 3 1 "num" CAST X,Y 14 -test/sql/copy/csv/data/error/mismatch/small1.csv 6 1 "num" CAST X,Y 26 +36 0 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +36 0 6 27 27 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +36 1 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' -query I -SELECT COUNT(*) -FROM csv_rejects_table_right; ----- -1 statement ok -DROP TABLE csv_rejects_table_left; +DROP TABLE reject_errors; statement ok -DROP TABLE csv_rejects_table_right; +DROP TABLE reject_scans; From 836f84d4ec456474baa34a1ea616a019529b9aab Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 4 Apr 2024 13:11:57 +0200 Subject: [PATCH 069/147] Adjust test --- .../csv/rejects/csv_rejects_two_tables.test | 48 ++++++++----------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/test/sql/copy/csv/rejects/csv_rejects_two_tables.test b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test index e9ad454f6052..f50128989810 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_two_tables.test +++ b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test @@ -17,20 +17,18 @@ BIGINT VARCHAR 11044 11044 2 query IIIIIIIIIIIII -SELECT * FROM reject_scans order by all; ---- 3 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1 3 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1 -query IIIIIIIII -SELECT * +query IIIIIIIIII FROM reject_errors order by all; ---- -3 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -3 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' -3 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -3 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +3 0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT' +3 1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +3 1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' # Test giving the name of errors table statement error @@ -54,20 +52,18 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M BIGINT VARCHAR 11044 11044 2 query IIIIIIIIIIIII -SELECT * FROM reject_scans order by all; ---- 8 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_2', sample_size=1 8 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_2', sample_size=1 -query IIIIIIIII -SELECT * +query IIIIIIIIII FROM rejects_errors_2 order by all; ---- -8 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -8 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' -8 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -8 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +8 0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +8 0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +8 1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +8 1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' statement ok drop table reject_errors; @@ -82,21 +78,18 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M BIGINT VARCHAR 11044 11044 2 query IIIIIIIIIIIII -SELECT * FROM rejects_scan_2 order by all; ---- 12 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_2', sample_size=1 12 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_2', sample_size=1 -query IIIIIIIII -SELECT * +query IIIIIIIIII FROM reject_errors order by all; ---- -12 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -12 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' -12 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -12 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' - +12 0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +12 0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +12 1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +12 1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT' # Test giving the name of both tables query IIIII @@ -116,14 +109,13 @@ FROM rejects_scan_3 order by all; 15 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_3', rejects_scan='rejects_scan_3', sample_size=1 15 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_3', rejects_scan='rejects_scan_3', sample_size=1 -query IIIIIIIII -SELECT * +query IIIIIIIIII FROM rejects_errors_3 order by all; ---- -15 0 2176 10875 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -15 0 4176 20875 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' -15 1 3680 18395 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -15 1 5680 28395 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +15 0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +15 0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +15 1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +15 1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' statement ok drop table reject_errors; From a6924b638a0851cbf4ef9909106500c8769840d4 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 4 Apr 2024 13:13:24 +0200 Subject: [PATCH 070/147] Small message adjustment --- test/sql/copy/csv/rejects/test_invalid_parameters.test | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/sql/copy/csv/rejects/test_invalid_parameters.test b/test/sql/copy/csv/rejects/test_invalid_parameters.test index 9325f3780f24..950337a90ebb 100644 --- a/test/sql/copy/csv/rejects/test_invalid_parameters.test +++ b/test/sql/copy/csv/rejects/test_invalid_parameters.test @@ -12,10 +12,10 @@ SELECT * FROM read_csv( 'test/sql/copy/csv/data/error/mismatch/bad.csv', columns = {'col0': 'INTEGER', 'col1': 'INTEGER', 'col2': 'VARCHAR'}, ignore_errors=false, - rejects_table='csv_rejects_table' + store_rejects=true ) ---- -only supported when IGNORE_ERRORS is set to true +STORE_REJECTS option is only supported when IGNORE_ERRORS is not manually set to false statement error SELECT * FROM read_csv( @@ -63,7 +63,7 @@ SELECT * FROM read_csv_auto( rejects_table='csv_rejects_table' ) ---- -only supported when IGNORE_ERRORS is set to true +option is only supported when IGNORE_ERRORS is not manually set to false statement error SELECT * FROM read_csv_auto( From 5f2883ff4e8d576d9698344c8e9309be2c754d66 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 4 Apr 2024 13:15:48 +0200 Subject: [PATCH 071/147] Adjustment to utf rejects --- .../csv/rejects/test_invalid_utf_rejects.test | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test index 94c56cc71562..e579648f8794 100644 --- a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test +++ b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test @@ -9,13 +9,12 @@ require notwindows statement ok from 
read_csv('test/sql/copy/csv/data/test/invalid_utf_big.csv',columns = {'col1': 'VARCHAR','col2': 'VARCHAR','col3': 'VARCHAR'}, - auto_detect=false, rejects_table='csv_rejects_table', header = 0, delim = ',', ignore_errors=true) + auto_detect=false, header = 0, delim = ',', store_rejects=true) -query IIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position -FROM csv_rejects_table; +query IIIIIIIIII rowsort +FROM reject_errors ORDER BY ALL; ---- -test/sql/copy/csv/data/test/invalid_utf_big.csv 3001 2 "col2" INVALID UNICODE valid,invalid_??_part,valid 54000 -test/sql/copy/csv/data/test/invalid_utf_big.csv 3012 3 "col3" INVALID UNICODE valid,valid,invalid_??_part 54208 -test/sql/copy/csv/data/test/invalid_utf_big.csv 3023 2 "col2" INVALID UNICODE valid,invalid_??_part,valid 54416 -test/sql/copy/csv/data/test/invalid_utf_big.csv 3034 3 "col3" INVALID UNICODE valid,valid,invalid_??_part 54624 \ No newline at end of file +3 0 3001 54001 54007 2 col2 INVALID UNICODE valid,invalid_??_part,valid Invalid unicode (byte sequence mismatch) detected. +3 0 3012 54209 54221 3 col3 INVALID UNICODE valid,valid,invalid_??_part Invalid unicode (byte sequence mismatch) detected. +3 0 3023 54417 54423 2 col2 INVALID UNICODE valid,invalid_??_part,valid Invalid unicode (byte sequence mismatch) detected. +3 0 3034 54625 54637 3 col3 INVALID UNICODE valid,valid,invalid_??_part Invalid unicode (byte sequence mismatch) detected. \ No newline at end of file From 1ae2d3c66dd6a847bcec18a5f10a9126924b8e4a Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 4 Apr 2024 13:27:13 +0200 Subject: [PATCH 072/147] More adjustments --- test/sql/copy/csv/rejects/test_mixed.test | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/test/sql/copy/csv/rejects/test_mixed.test b/test/sql/copy/csv/rejects/test_mixed.test index 45001a5e4b05..e5ced0ea3ae8 100644 --- a/test/sql/copy/csv/rejects/test_mixed.test +++ b/test/sql/copy/csv/rejects/test_mixed.test @@ -11,8 +11,7 @@ query III SELECT * FROM read_csv( 'data/csv/rejects/frankstein/nightmare.csv', columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'VARCHAR'}, - rejects_table='csv_rejects_table', - ignore_errors=true, auto_detect=false, header = 1, max_line_size=20); + store_rejects = true, auto_detect=false, header = 1, max_line_size=20); ---- 1 2 pedro 1 2 pedro @@ -56,13 +55,12 @@ SELECT * FROM read_csv( 1 2 pedro 1 2 pedro -query IIIIIIII rowsort -SELECT regexp_replace(file, '\\', '/', 'g'), line, column_idx, column_name, error_type, csv_line, byte_position, error_message -FROM csv_rejects_table; +query IIIIIIIIII rowsort +FROM reject_errors ORDER BY ALL; ---- -data/csv/rejects/frankstein/nightmare.csv 10 2 "c" MISSING COLUMNS 1,2 102 Expected Number of Columns: 3 Found: 2 -data/csv/rejects/frankstein/nightmare.csv 14 4 NULL TOO MANY COLUMNS 1,2,"pedro",5 142 Expected Number of Columns: 3 Found: 4 -data/csv/rejects/frankstein/nightmare.csv 19 2 "b" CAST 1,bla,"pedro" 204 Error when converting column "b". Could not convert string "bla" to 'INTEGER' -data/csv/rejects/frankstein/nightmare.csv 22 3 "c" UNQUOTED VALUE 1,2,"pedro"bla 242 Value with unterminated quote found. -data/csv/rejects/frankstein/nightmare.csv 32 1 "a" LINE SIZE OVER MAXIMUM 1,2,"pedro thiago timbo holanda" 365 Maximum line size of 20 bytes exceeded. Actual Size:33 bytes. -data/csv/rejects/frankstein/nightmare.csv 38 3 "c" INVALID UNICODE 1,2,"pedro??" 458 Invalid unicode (byte sequence mismatch) detected. 
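
A minimal follow-up sketch, not part of this patch: the single ten-column scan above replaces the old per-error regex probes, so summaries now come from plain SQL over reject_errors. The column names used here (error_type, column_name) are assumed to carry over from the old csv_rejects_table schema queried earlier in this file:

    -- group rejects by error class; column_name is NULL when no single
    -- column can be blamed (e.g. TOO MANY COLUMNS)
    SELECT error_type, count(*) AS errors,
           count(column_name) AS with_column
    FROM reject_errors
    GROUP BY error_type
    ORDER BY ALL;
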
\ No newline at end of file +3 0 10 103 106 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 3 Found: 2 +3 0 14 143 154 4 NULL TOO MANY COLUMNS 1,2,"pedro",5 Expected Number of Columns: 3 Found: 4 +3 0 19 205 207 2 b CAST 1,bla,"pedro" Error when converting column "b". Could not convert string "bla" to 'INTEGER' +3 0 22 243 247 3 c UNQUOTED VALUE 1,2,"pedro"bla Value with unterminated quote found. +3 0 32 366 366 1 a LINE SIZE OVER MAXIMUM 1,2,"pedro thiago timbo holanda" Maximum line size of 20 bytes exceeded. Actual Size:33 bytes. +3 0 38 459 463 3 c INVALID UNICODE 1,2,"pedro??" Invalid unicode (byte sequence mismatch) detected. \ No newline at end of file From e58c90b1013e97653d75bbbbff2c7f219dca60aa Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 4 Apr 2024 15:36:48 +0200 Subject: [PATCH 073/147] Bunch of tests for multiple error return on borked lines --- data/csv/rejects/multiple_errors.csv | 8 ++ .../multiple_errors/cast_and_less_col.csv | 5 + .../multiple_errors/cast_and_more_col.csv | 5 + .../multiple_cast_implicit.csv | 4 + .../multiple_errors/multiple_casts_flush.csv | 4 + .../multiple_errors/multiple_casts_mixed.csv | 4 + .../scanner/string_value_scanner.cpp | 15 +- .../test_multiple_errors_same_line.test | 131 ++++++++++++++++++ 8 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 data/csv/rejects/multiple_errors.csv create mode 100644 data/csv/rejects/multiple_errors/cast_and_less_col.csv create mode 100644 data/csv/rejects/multiple_errors/cast_and_more_col.csv create mode 100644 data/csv/rejects/multiple_errors/multiple_cast_implicit.csv create mode 100644 data/csv/rejects/multiple_errors/multiple_casts_flush.csv create mode 100644 data/csv/rejects/multiple_errors/multiple_casts_mixed.csv create mode 100644 test/sql/copy/csv/rejects/test_multiple_errors_same_line.test diff --git a/data/csv/rejects/multiple_errors.csv b/data/csv/rejects/multiple_errors.csv new file mode 100644 index 000000000000..6d10e51c3cf5 --- /dev/null +++ b/data/csv/rejects/multiple_errors.csv @@ -0,0 +1,8 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boogie,3, 2023-01-02, 5 +oogie boogie,3, 2023-01-03, bla, 7 +oogie boogie,3, bla, bla, 7 +oogie boogie,3, 2023-01-04, 8 +oogie boogie,3, bla +oogie boogieoogie boogieoogie boogieoogie boogieoogie boogieoogie boogieoogie boogie,3, bla diff --git a/data/csv/rejects/multiple_errors/cast_and_less_col.csv b/data/csv/rejects/multiple_errors/cast_and_less_col.csv new file mode 100644 index 000000000000..25f4dfe159ed --- /dev/null +++ b/data/csv/rejects/multiple_errors/cast_and_less_col.csv @@ -0,0 +1,5 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boogie,3, 2023-01-02, 5 +oogie boogie,bla, 2023-01-03 +oogie boogie,bla diff --git a/data/csv/rejects/multiple_errors/cast_and_more_col.csv b/data/csv/rejects/multiple_errors/cast_and_more_col.csv new file mode 100644 index 000000000000..4e5a7d2321d3 --- /dev/null +++ b/data/csv/rejects/multiple_errors/cast_and_more_col.csv @@ -0,0 +1,5 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boogie,3, 2023-01-02, 5 +oogie boogie,3, 2023-01-03, bla, 7 +oogie boogie,3, 2023-01-03, bla, 7, 8 diff --git a/data/csv/rejects/multiple_errors/multiple_cast_implicit.csv b/data/csv/rejects/multiple_errors/multiple_cast_implicit.csv new file mode 100644 index 000000000000..26ad443f6fdc --- /dev/null +++ b/data/csv/rejects/multiple_errors/multiple_cast_implicit.csv @@ -0,0 +1,4 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 
+oogie boogie,3, 2023-01-02, 5 +oogie boogie,bla_2, 2023-01-02, bla_1 \ No newline at end of file diff --git a/data/csv/rejects/multiple_errors/multiple_casts_flush.csv b/data/csv/rejects/multiple_errors/multiple_casts_flush.csv new file mode 100644 index 000000000000..21d7a7b58b54 --- /dev/null +++ b/data/csv/rejects/multiple_errors/multiple_casts_flush.csv @@ -0,0 +1,4 @@ +name,age,current_day, tomorrow +oogie boogie,3, 2023-01-01, 2023-01-02 +oogie boogie,3, 2023-01-02, 2023-01-03 +oogie boogie,3, bla_2, bla_1 \ No newline at end of file diff --git a/data/csv/rejects/multiple_errors/multiple_casts_mixed.csv b/data/csv/rejects/multiple_errors/multiple_casts_mixed.csv new file mode 100644 index 000000000000..3931dbb1821b --- /dev/null +++ b/data/csv/rejects/multiple_errors/multiple_casts_mixed.csv @@ -0,0 +1,4 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boogie,3, 2023-01-02, 5 +oogie boogie,3, bla_2, bla_1 \ No newline at end of file diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index dd7c4364fb9d..0a0b286b4671 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -365,6 +365,7 @@ bool StringValueResult::HandleError() { switch (cur_error.type) { case CSVErrorType::TOO_MANY_COLUMNS: + case CSVErrorType::TOO_FEW_COLUMNS: if (current_line_position.begin == line_pos) { csv_error = CSVError::IncorrectColumnAmountError( state_machine.options, col_idx, lines_per_batch, borked_line, @@ -503,7 +504,15 @@ bool StringValueResult::AddRowInternal() { CSVError::LineSizeError(state_machine.options, current_line_size, lines_per_batch, borked_line, current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); error_handler.Error(csv_error); - number_of_rows--; + if (number_of_rows > 0) { + number_of_rows--; + } + } + if (!current_errors.empty()) { + // We need to add a few columns error + for (idx_t col_idx = cur_col_id; col_idx < number_of_columns; col_idx++) { + current_errors.push_back({CSVErrorType::TOO_FEW_COLUMNS, col_idx - 1, last_position}); + } } if (HandleError()) { return false; @@ -553,7 +562,9 @@ bool StringValueResult::AddRowInternal() { error_handler.Error(csv_error); } // If we are here we ignore_errors, so we delete this line - number_of_rows--; + if (number_of_rows > 0) { + number_of_rows--; + } } } line_positions_per_row[number_of_rows] = current_line_position; diff --git a/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test b/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test new file mode 100644 index 000000000000..22ed108fee20 --- /dev/null +++ b/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test @@ -0,0 +1,131 @@ +# name: test/sql/copy/csv/rejects/test_multiple_errors_same_line.test +# description: Tests a mix of multiple errors and validate they get hit +# group: [rejects] + +require skip_reload + +# Test will fail on windows because byte_position is slightly different due to \r\n instead of \n +require notwindows + +#query IIII +#FROM read_csv('data/csv/rejects/multiple_errors/cast_and_more_col.csv', +# columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, +# store_rejects = true, auto_detect=false, header = 1); +#---- +#oogie boogie 3 2023-01-01 2 +#oogie boogie 3 2023-01-02 5 +# +#query IIIIIIIII rowsort +#SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; 
+#---- +#0 4 89 116 4 barks CAST oogie boogie,3, 2023-01-03, bla, 7 Error when converting column "barks". Could not convert string " bla" to 'INTEGER' +#0 4 89 120 5 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7 Expected Number of Columns: 4 Found: 5 +#0 5 124 151 4 barks CAST oogie boogie,3, 2023-01-03, bla, 7, 8 Error when converting column "barks". Could not convert string " bla" to 'INTEGER' +#0 5 124 155 5 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7, 8 Expected Number of Columns: 4 Found: 5 +#0 5 124 158 6 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7, 8 Expected Number of Columns: 4 Found: 6 +# +#statement ok +#DROP TABLE reject_errors; +# +#statement ok +#DROP TABLE reject_scans; +# +#query IIII +#FROM read_csv('data/csv/rejects/multiple_errors/multiple_cast_implicit.csv', +# columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, +# store_rejects = true, auto_detect=false, header = 1); +#---- +#oogie boogie 3 2023-01-01 2 +#oogie boogie 3 2023-01-02 5 +# +#query IIIIIIIII rowsort +#SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; +#---- +#0 4 89 102 2 age CAST oogie boogie,bla_2, 2023-01-02, bla_1 Error when converting column "age". Could not convert string "bla_2" to 'INTEGER' +#0 4 89 120 4 barks CAST oogie boogie,bla_2, 2023-01-02, bla_1 Error when converting column "barks". Could not convert string " bla_1" to 'INTEGER' +# +#statement ok +#DROP TABLE reject_errors; +# +#statement ok +#DROP TABLE reject_scans; +# +#query IIII +#FROM read_csv('data/csv/rejects/multiple_errors/multiple_casts_flush.csv', +# columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'tomorrow': 'DATE'}, +# store_rejects = true, auto_detect=false, header = 1); +#---- +#oogie boogie 3 2023-01-01 2023-01-02 +#oogie boogie 3 2023-01-02 2023-01-03 +# +#query IIIIIIIII rowsort +#SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; +#---- +#0 4 110 NULL 3 current_day CAST oogie boogie,3, bla_2, bla_1 Error when converting column "current_day". date field value out of range: " bla_2", expected format is (YYYY-MM-DD) +#0 4 110 NULL 4 tomorrow CAST oogie boogie,3, bla_2, bla_1 Error when converting column "tomorrow". date field value out of range: " bla_1", expected format is (YYYY-MM-DD) +# +#statement ok +#DROP TABLE reject_errors; +# +#statement ok +#DROP TABLE reject_scans; +# +#query IIII +#FROM read_csv('data/csv/rejects/multiple_errors/multiple_casts_mixed.csv', +# columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, +# store_rejects = true, auto_detect=false, header = 1); +#---- +#oogie boogie 3 2023-01-01 2 +#oogie boogie 3 2023-01-02 5 +# +## FIXME: This will not present the both cast errors :'(, should be alleviated the more types we add to implicit casting +#query IIIIIIIII rowsort +#SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; +#---- +#0 4 89 111 4 barks CAST oogie boogie,3, bla_2, bla_1 Error when converting column "barks". 
Could not convert string " bla_1" to 'INTEGER' +# +#statement ok +#DROP TABLE reject_errors; +# +#statement ok +#DROP TABLE reject_scans; + +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/cast_and_less_col.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 5 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; +---- +0 4 89 102 2 age CAST oogie boogie,bla, 2023-01-03 Error when converting column "age". Could not convert string "bla" to 'INTEGER' +0 4 89 117 3 barks MISSING COLUMNS oogie boogie,bla, 2023-01-03 Expected Number of Columns: 4 Found: 3 +0 5 118 131 2 age CAST oogie boogie,bla Error when converting column "age". Could not convert string "bla" to 'INTEGER' +0 5 118 134 2 current_day MISSING COLUMNS oogie boogie,bla Expected Number of Columns: 4 Found: 2 +0 5 118 134 3 barks MISSING COLUMNS oogie boogie,bla Expected Number of Columns: 4 Found: 3 + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + +#query IIII +#FROM read_csv('data/csv/rejects/multiple_errors.csv', +# columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, +# store_rejects = true, auto_detect=false, header = 1, max_line_size=40); +#---- +#oogie boogie 3 2023-01-01 2 +#oogie boogie 3 2023-01-02 5 +# +#query IIIIIIIIII rowsort +#FROM reject_errors ORDER BY ALL; +#---- +#3 0 4 89 116 4 barks CAST oogie boogie,3, 2023-01-03, bla, 7 Error when converting column "barks". Could not convert string " bla" to 'INTEGER' +#3 0 4 89 120 5 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7 Expected Number of Columns: 4 Found: 5 +#3 0 5 124 144 4 barks CAST oogie boogie,3, bla, bla, 7 Error when converting column "barks". 
Could not convert string " bla" to 'INTEGER' +#3 0 5 124 148 5 NULL TOO MANY COLUMNS oogie boogie,3, bla, bla, 7 Expected Number of Columns: 4 Found: 5 +#3 0 6 152 171 3 barks MISSING COLUMNS oogie boogie,3, bla Expected Number of Columns: 4 Found: 3 \ No newline at end of file From d8575d835221e2375b5f7f5bcaf2d9d188b0d9d7 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 4 Apr 2024 16:33:46 +0200 Subject: [PATCH 074/147] More tests on maxlinesize and some fixes --- .../multiple_errors/cast_and_maxline.csv | 4 + .../multiple_errors/less_col_and_max_line.csv | 4 + .../multiple_errors/more_col_and_max_line.csv | 4 + .../scanner/string_value_scanner.cpp | 22 +- .../table_function/global_csv_state.cpp | 7 +- .../csv_scanner/string_value_scanner.hpp | 1 + .../csv/rejects/csv_rejects_maximum_line.test | 18 +- test/sql/copy/csv/rejects/test_mixed.test | 2 +- .../test_multiple_errors_same_line.test | 226 +++++++++++------- 9 files changed, 181 insertions(+), 107 deletions(-) create mode 100644 data/csv/rejects/multiple_errors/cast_and_maxline.csv create mode 100644 data/csv/rejects/multiple_errors/less_col_and_max_line.csv create mode 100644 data/csv/rejects/multiple_errors/more_col_and_max_line.csv diff --git a/data/csv/rejects/multiple_errors/cast_and_maxline.csv b/data/csv/rejects/multiple_errors/cast_and_maxline.csv new file mode 100644 index 000000000000..e4e871e59462 --- /dev/null +++ b/data/csv/rejects/multiple_errors/cast_and_maxline.csv @@ -0,0 +1,4 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boogie,3, 2023-01-02, 5 +oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03, 4 diff --git a/data/csv/rejects/multiple_errors/less_col_and_max_line.csv b/data/csv/rejects/multiple_errors/less_col_and_max_line.csv new file mode 100644 index 000000000000..bb3dbe4dfc74 --- /dev/null +++ b/data/csv/rejects/multiple_errors/less_col_and_max_line.csv @@ -0,0 +1,4 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boogie,3, 2023-01-02, 5 +oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03 diff --git a/data/csv/rejects/multiple_errors/more_col_and_max_line.csv b/data/csv/rejects/multiple_errors/more_col_and_max_line.csv new file mode 100644 index 000000000000..27366cd56e5c --- /dev/null +++ b/data/csv/rejects/multiple_errors/more_col_and_max_line.csv @@ -0,0 +1,4 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boogie,3, 2023-01-02, 5 +oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03,4, bla diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp index 0a0b286b4671..acade767b83e 100644 --- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp +++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp @@ -424,7 +424,11 @@ bool StringValueResult::HandleError() { current_line_position.begin.GetGlobalPosition(requested_size, first_nl), line_pos.GetGlobalPosition(requested_size), parse_types[cur_error.col_idx].first); } - + break; + case CSVErrorType::MAXIMUM_LINE_SIZE: + csv_error = CSVError::LineSizeError( + state_machine.options, cur_error.current_line_size, lines_per_batch, borked_line, + current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); break; default: throw InvalidInputException("CSV Error not allowed when inserting row"); @@ -497,16 +501,8 @@ bool StringValueResult::AddRowInternal() { current_line_position.begin = current_line_position.end; 
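	// Annotation, not part of this patch: the hunk below stops reporting an
	// oversized line immediately. AddRowInternal() now queues a
	// CSVErrorType::MAXIMUM_LINE_SIZE entry in current_errors (with the line
	// size stashed on the entry), and the HandleError() case added earlier in
	// this diff builds the CSVError::LineSizeError from it, so a too-long row
	// is flushed together with any other errors found on that same row.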
current_line_position.end = current_line_start; if (current_line_size > state_machine.options.maximum_line_size) { - bool first_nl; - auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles); - LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read); - auto csv_error = - CSVError::LineSizeError(state_machine.options, current_line_size, lines_per_batch, borked_line, - current_line_position.begin.GetGlobalPosition(requested_size, first_nl)); - error_handler.Error(csv_error); - if (number_of_rows > 0) { - number_of_rows--; - } + current_errors.push_back({CSVErrorType::MAXIMUM_LINE_SIZE, 1, last_position}); + current_errors.back().current_line_size = current_line_size; } if (!current_errors.empty()) { // We need to add a few columns error @@ -562,9 +558,7 @@ bool StringValueResult::AddRowInternal() { error_handler.Error(csv_error); } // If we are here we ignore_errors, so we delete this line - if (number_of_rows > 0) { - number_of_rows--; - } + number_of_rows--; } } line_positions_per_row[number_of_rows] = current_line_position; diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index 6646f4ab98b8..b156f0af8b95 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -282,10 +282,15 @@ void CSVGlobalState::FillRejectsTable() { errors_appender.Append(error.byte_position + 1); } // 6. Column Index - errors_appender.Append(col_idx + 1); + if (error.type == CSVErrorType::MAXIMUM_LINE_SIZE) { + errors_appender.Append(Value()); + } else { + errors_appender.Append(col_idx + 1); + } // 7. Column Name (If Applicable) switch (error.type) { case CSVErrorType::TOO_MANY_COLUMNS: + case CSVErrorType::MAXIMUM_LINE_SIZE: errors_appender.Append(Value()); break; case CSVErrorType::TOO_FEW_COLUMNS: diff --git a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp index 0039f9ade5b0..7f4e6d8f017f 100644 --- a/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp +++ b/src/include/duckdb/execution/operator/csv_scanner/string_value_scanner.hpp @@ -74,6 +74,7 @@ class CurrentError { CSVErrorType type; idx_t col_idx; + idx_t current_line_size; string error_message; //! Exact Position where the error happened LinePosition error_position; diff --git a/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test index 1095a90d70f8..99cedc820614 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test +++ b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test @@ -16,7 +16,7 @@ SELECT * FROM read_csv( query IIIIIIIIII FROM reject_errors order by all; ---- -3 0 5 23 23 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. +3 0 5 23 23 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. statement ok DROP TABLE reject_errors; @@ -37,7 +37,7 @@ SELECT * FROM read_csv( query IIIIIIIII SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -0 5 23 23 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. +0 5 23 23 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. 
Actual Size:19 bytes. statement ok DROP TABLE reject_errors; @@ -57,9 +57,9 @@ SELECT * FROM read_csv( query IIIIIIIIII FROM reject_errors order by all; ---- -27 0 2282 13685 13685 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. -27 0 2591 15558 15558 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. -27 0 2923 17569 17569 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. +27 0 2282 13685 13685 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. +27 0 2591 15558 15558 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. +27 0 2923 17569 17569 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. statement ok DROP TABLE reject_errors; @@ -77,10 +77,10 @@ SELECT * FROM read_csv( query IIIIIIIIII FROM reject_errors order by all; ---- -31 0 5 23 23 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. -31 1 2282 13685 13685 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. -31 1 2591 15558 15558 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. -31 1 2923 17569 17569 1 a LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. +31 0 5 23 23 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. +31 1 2282 13685 13685 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. +31 1 2591 15558 15558 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. +31 1 2923 17569 17569 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. statement ok DROP TABLE reject_errors; diff --git a/test/sql/copy/csv/rejects/test_mixed.test b/test/sql/copy/csv/rejects/test_mixed.test index e5ced0ea3ae8..d1f9b1decedc 100644 --- a/test/sql/copy/csv/rejects/test_mixed.test +++ b/test/sql/copy/csv/rejects/test_mixed.test @@ -62,5 +62,5 @@ FROM reject_errors ORDER BY ALL; 3 0 14 143 154 4 NULL TOO MANY COLUMNS 1,2,"pedro",5 Expected Number of Columns: 3 Found: 4 3 0 19 205 207 2 b CAST 1,bla,"pedro" Error when converting column "b". Could not convert string "bla" to 'INTEGER' 3 0 22 243 247 3 c UNQUOTED VALUE 1,2,"pedro"bla Value with unterminated quote found. -3 0 32 366 366 1 a LINE SIZE OVER MAXIMUM 1,2,"pedro thiago timbo holanda" Maximum line size of 20 bytes exceeded. Actual Size:33 bytes. +3 0 32 366 366 NULL NULL LINE SIZE OVER MAXIMUM 1,2,"pedro thiago timbo holanda" Maximum line size of 20 bytes exceeded. Actual Size:33 bytes. 3 0 38 459 463 3 c INVALID UNICODE 1,2,"pedro??" Invalid unicode (byte sequence mismatch) detected. 
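
One behavioral change in the expectations above is easy to miss: a LINE SIZE OVER MAXIMUM reject is no longer attributed to column 1 but now carries NULL for both the column index and the column name. A short sketch of filtering on that, assuming only table and column names already used in these tests:

    -- line-size rejects have no per-column attribution
    SELECT line, csv_line, error_message
    FROM reject_errors
    WHERE error_type = 'LINE SIZE OVER MAXIMUM'
      AND column_idx IS NULL;
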
\ No newline at end of file diff --git a/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test b/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test index 22ed108fee20..9c2243435322 100644 --- a/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test +++ b/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test @@ -7,88 +7,88 @@ require skip_reload # Test will fail on windows because byte_position is slightly different due to \r\n instead of \n require notwindows -#query IIII -#FROM read_csv('data/csv/rejects/multiple_errors/cast_and_more_col.csv', -# columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, -# store_rejects = true, auto_detect=false, header = 1); -#---- -#oogie boogie 3 2023-01-01 2 -#oogie boogie 3 2023-01-02 5 -# -#query IIIIIIIII rowsort -#SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; -#---- -#0 4 89 116 4 barks CAST oogie boogie,3, 2023-01-03, bla, 7 Error when converting column "barks". Could not convert string " bla" to 'INTEGER' -#0 4 89 120 5 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7 Expected Number of Columns: 4 Found: 5 -#0 5 124 151 4 barks CAST oogie boogie,3, 2023-01-03, bla, 7, 8 Error when converting column "barks". Could not convert string " bla" to 'INTEGER' -#0 5 124 155 5 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7, 8 Expected Number of Columns: 4 Found: 5 -#0 5 124 158 6 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7, 8 Expected Number of Columns: 4 Found: 6 -# -#statement ok -#DROP TABLE reject_errors; -# -#statement ok -#DROP TABLE reject_scans; -# -#query IIII -#FROM read_csv('data/csv/rejects/multiple_errors/multiple_cast_implicit.csv', -# columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, -# store_rejects = true, auto_detect=false, header = 1); -#---- -#oogie boogie 3 2023-01-01 2 -#oogie boogie 3 2023-01-02 5 -# -#query IIIIIIIII rowsort -#SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; -#---- -#0 4 89 102 2 age CAST oogie boogie,bla_2, 2023-01-02, bla_1 Error when converting column "age". Could not convert string "bla_2" to 'INTEGER' -#0 4 89 120 4 barks CAST oogie boogie,bla_2, 2023-01-02, bla_1 Error when converting column "barks". Could not convert string " bla_1" to 'INTEGER' -# -#statement ok -#DROP TABLE reject_errors; -# -#statement ok -#DROP TABLE reject_scans; -# -#query IIII -#FROM read_csv('data/csv/rejects/multiple_errors/multiple_casts_flush.csv', -# columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'tomorrow': 'DATE'}, -# store_rejects = true, auto_detect=false, header = 1); -#---- -#oogie boogie 3 2023-01-01 2023-01-02 -#oogie boogie 3 2023-01-02 2023-01-03 -# -#query IIIIIIIII rowsort -#SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; -#---- -#0 4 110 NULL 3 current_day CAST oogie boogie,3, bla_2, bla_1 Error when converting column "current_day". date field value out of range: " bla_2", expected format is (YYYY-MM-DD) -#0 4 110 NULL 4 tomorrow CAST oogie boogie,3, bla_2, bla_1 Error when converting column "tomorrow". 
date field value out of range: " bla_1", expected format is (YYYY-MM-DD) -# -#statement ok -#DROP TABLE reject_errors; -# -#statement ok -#DROP TABLE reject_scans; -# -#query IIII -#FROM read_csv('data/csv/rejects/multiple_errors/multiple_casts_mixed.csv', -# columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, -# store_rejects = true, auto_detect=false, header = 1); -#---- -#oogie boogie 3 2023-01-01 2 -#oogie boogie 3 2023-01-02 5 -# -## FIXME: This will not present the both cast errors :'(, should be alleviated the more types we add to implicit casting -#query IIIIIIIII rowsort -#SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; -#---- -#0 4 89 111 4 barks CAST oogie boogie,3, bla_2, bla_1 Error when converting column "barks". Could not convert string " bla_1" to 'INTEGER' -# -#statement ok -#DROP TABLE reject_errors; -# -#statement ok -#DROP TABLE reject_scans; +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/cast_and_more_col.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 5 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; +---- +0 4 89 116 4 barks CAST oogie boogie,3, 2023-01-03, bla, 7 Error when converting column "barks". Could not convert string " bla" to 'INTEGER' +0 4 89 120 5 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7 Expected Number of Columns: 4 Found: 5 +0 5 124 151 4 barks CAST oogie boogie,3, 2023-01-03, bla, 7, 8 Error when converting column "barks". Could not convert string " bla" to 'INTEGER' +0 5 124 155 5 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7, 8 Expected Number of Columns: 4 Found: 5 +0 5 124 158 6 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7, 8 Expected Number of Columns: 4 Found: 6 + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/multiple_cast_implicit.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 5 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; +---- +0 4 89 102 2 age CAST oogie boogie,bla_2, 2023-01-02, bla_1 Error when converting column "age". Could not convert string "bla_2" to 'INTEGER' +0 4 89 120 4 barks CAST oogie boogie,bla_2, 2023-01-02, bla_1 Error when converting column "barks". Could not convert string " bla_1" to 'INTEGER' + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/multiple_casts_flush.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'tomorrow': 'DATE'}, + store_rejects = true, auto_detect=false, header = 1); +---- +oogie boogie 3 2023-01-01 2023-01-02 +oogie boogie 3 2023-01-02 2023-01-03 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; +---- +0 4 110 NULL 3 current_day CAST oogie boogie,3, bla_2, bla_1 Error when converting column "current_day". date field value out of range: " bla_2", expected format is (YYYY-MM-DD) +0 4 110 NULL 4 tomorrow CAST oogie boogie,3, bla_2, bla_1 Error when converting column "tomorrow". 
date field value out of range: " bla_1", expected format is (YYYY-MM-DD) + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/multiple_casts_mixed.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 5 + +# FIXME: This will not present the both cast errors :'(, should be alleviated the more types we add to implicit casting +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; +---- +0 4 89 111 4 barks CAST oogie boogie,3, bla_2, bla_1 Error when converting column "barks". Could not convert string " bla_1" to 'INTEGER' + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; query IIII FROM read_csv('data/csv/rejects/multiple_errors/cast_and_less_col.csv', @@ -113,6 +113,68 @@ DROP TABLE reject_errors; statement ok DROP TABLE reject_scans; +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/cast_and_maxline.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1, max_line_size=40); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 5 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; +---- +0 4 89 138 2 age CAST oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03, 4 Error when converting column "age". Could not convert string "bla" to 'INTEGER' +0 4 89 89 NULL NULL LINE SIZE OVER MAXIMUM oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03, 4 Maximum line size of 40 bytes exceeded. Actual Size:68 bytes. + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/less_col_and_max_line.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1, max_line_size=40); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 5 + +query IIIIIIIII +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position; +---- +0 4 89 89 NULL NULL LINE SIZE OVER MAXIMUM oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03 Maximum line size of 40 bytes exceeded. Actual Size:65 bytes. +0 4 89 138 2 age CAST oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03 Error when converting column "age". Could not convert string "bla" to 'INTEGER' +0 4 89 153 3 barks MISSING COLUMNS oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03 Expected Number of Columns: 4 Found: 3 + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/more_col_and_max_line.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1, max_line_size=40); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 5 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position; +---- +0 4 89 138 2 age CAST oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03,4, bla Error when converting column "age". 
Could not convert string "bla" to 'INTEGER' +0 4 89 155 5 NULL TOO MANY COLUMNS oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03,4, bla Expected Number of Columns: 4 Found: 5 +0 4 89 89 NULL NULL LINE SIZE OVER MAXIMUM oogie boogieoogie boogieoogie boogieoogie boogie,bla, 2023-01-03,4, bla Maximum line size of 40 bytes exceeded. Actual Size:72 bytes. + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + #query IIII #FROM read_csv('data/csv/rejects/multiple_errors.csv', # columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, From 0cfb19acc9e040fa3293b9ad63852ffcbeeb2059 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 4 Apr 2024 16:46:46 +0200 Subject: [PATCH 075/147] Adding unquoted mix tests --- .../rejects/multiple_errors/unquoted_cast.csv | 5 ++ .../rejects/multiple_errors/unquoted_less.csv | 5 ++ .../multiple_errors/unquoted_maxline.csv | 5 ++ .../rejects/multiple_errors/unquoted_more.csv | 5 ++ .../test_multiple_errors_same_line.test | 82 +++++++++++++++++++ 5 files changed, 102 insertions(+) create mode 100644 data/csv/rejects/multiple_errors/unquoted_cast.csv create mode 100644 data/csv/rejects/multiple_errors/unquoted_less.csv create mode 100644 data/csv/rejects/multiple_errors/unquoted_maxline.csv create mode 100644 data/csv/rejects/multiple_errors/unquoted_more.csv diff --git a/data/csv/rejects/multiple_errors/unquoted_cast.csv b/data/csv/rejects/multiple_errors/unquoted_cast.csv new file mode 100644 index 000000000000..9cb8bf160c8c --- /dev/null +++ b/data/csv/rejects/multiple_errors/unquoted_cast.csv @@ -0,0 +1,5 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +"oogie boogie"bla,bla, 2023-01-02, 5 +oogie boogie,3, 2023-01-02, 7 + diff --git a/data/csv/rejects/multiple_errors/unquoted_less.csv b/data/csv/rejects/multiple_errors/unquoted_less.csv new file mode 100644 index 000000000000..5cd602581222 --- /dev/null +++ b/data/csv/rejects/multiple_errors/unquoted_less.csv @@ -0,0 +1,5 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +"oogie boogie"bla,4, 2023-01-02 +oogie boogie,3, 2023-01-02, 7 + diff --git a/data/csv/rejects/multiple_errors/unquoted_maxline.csv b/data/csv/rejects/multiple_errors/unquoted_maxline.csv new file mode 100644 index 000000000000..1dc1f8f2d505 --- /dev/null +++ b/data/csv/rejects/multiple_errors/unquoted_maxline.csv @@ -0,0 +1,5 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +"oogie boogieoogie boogieoogie boogieoogie boogie"bla,4, 2023-01-02, 5 +oogie boogie,3, 2023-01-02, 7 + diff --git a/data/csv/rejects/multiple_errors/unquoted_more.csv b/data/csv/rejects/multiple_errors/unquoted_more.csv new file mode 100644 index 000000000000..051e8cc90b86 --- /dev/null +++ b/data/csv/rejects/multiple_errors/unquoted_more.csv @@ -0,0 +1,5 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +"oogie boogie"bla,4, 2023-01-02, 5, 8 +oogie boogie,3, 2023-01-02, 7 + diff --git a/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test b/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test index 9c2243435322..112e7e0e2575 100644 --- a/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test +++ b/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test @@ -175,6 +175,88 @@ DROP TABLE reject_errors; statement ok DROP TABLE reject_scans; +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/unquoted_cast.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 
'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1, max_line_size=40); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 7 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position; +---- +0 3 59 59 1 name UNQUOTED VALUE "oogie boogie"bla,bla, 2023-01-02, 5 Value with unterminated quote found. +0 3 59 77 2 age CAST "oogie boogie"bla,bla, 2023-01-02, 5 Error when converting column "age". Could not convert string "bla" to 'INTEGER' + + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/unquoted_less.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1, max_line_size=40); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 7 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position; +---- +0 3 59 59 1 name UNQUOTED VALUE "oogie boogie"bla,4, 2023-01-02 Value with unterminated quote found. +0 3 59 90 3 barks MISSING COLUMNS "oogie boogie"bla,4, 2023-01-02 Expected Number of Columns: 4 Found: 3 + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/unquoted_maxline.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1, max_line_size=40); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 7 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position; +---- +0 3 59 59 1 name UNQUOTED VALUE "oogie boogieoogie boogieoogie boogieoogie boogie"bla,4, 2023-01-02, 5 Value with unterminated quote found. +0 3 59 59 NULL NULL LINE SIZE OVER MAXIMUM "oogie boogieoogie boogieoogie boogieoogie boogie"bla,4, 2023-01-02, 5 Maximum line size of 40 bytes exceeded. Actual Size:71 bytes. + + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + +query IIII +FROM read_csv('data/csv/rejects/multiple_errors/unquoted_more.csv', + columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, + store_rejects = true, auto_detect=false, header = 1, max_line_size=40); +---- +oogie boogie 3 2023-01-01 2 +oogie boogie 3 2023-01-02 7 + +query IIIIIIIII rowsort +SElECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position; +---- +0 3 59 59 1 name UNQUOTED VALUE "oogie boogie"bla,4, 2023-01-02, 5, 8 Value with unterminated quote found. 
+0 3 59 93 5 NULL TOO MANY COLUMNS "oogie boogie"bla,4, 2023-01-02, 5, 8 Expected Number of Columns: 4 Found: 5 + +statement ok +DROP TABLE reject_errors; + +statement ok +DROP TABLE reject_scans; + #query IIII #FROM read_csv('data/csv/rejects/multiple_errors.csv', # columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'}, From 8c184d6248e84f9791ecfc9dc5a3180d565619ba Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Thu, 4 Apr 2024 17:05:11 +0200 Subject: [PATCH 076/147] More tests --- .../multiple_errors/invalid_utf_cast.csv | 3 + .../multiple_errors/invalid_utf_less.csv | 3 + .../multiple_errors/invalid_utf_max_line.csv | 3 + .../multiple_errors/invalid_utf_more.csv | 3 + .../multiple_errors/invalid_utf_unquoted.csv | 3 + .../{ => multiple_errors}/multiple_errors.csv | 2 +- .../scanner/string_value_scanner.cpp | 15 +- .../test_multiple_errors_same_line.test | 136 +++++++++++++++--- 8 files changed, 141 insertions(+), 27 deletions(-) create mode 100644 data/csv/rejects/multiple_errors/invalid_utf_cast.csv create mode 100644 data/csv/rejects/multiple_errors/invalid_utf_less.csv create mode 100644 data/csv/rejects/multiple_errors/invalid_utf_max_line.csv create mode 100644 data/csv/rejects/multiple_errors/invalid_utf_more.csv create mode 100644 data/csv/rejects/multiple_errors/invalid_utf_unquoted.csv rename data/csv/rejects/{ => multiple_errors}/multiple_errors.csv (89%) diff --git a/data/csv/rejects/multiple_errors/invalid_utf_cast.csv b/data/csv/rejects/multiple_errors/invalid_utf_cast.csv new file mode 100644 index 000000000000..a4b2844afd03 --- /dev/null +++ b/data/csv/rejects/multiple_errors/invalid_utf_cast.csv @@ -0,0 +1,3 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boÿÿgie,bla, 2023-01-01, 2 \ No newline at end of file diff --git a/data/csv/rejects/multiple_errors/invalid_utf_less.csv b/data/csv/rejects/multiple_errors/invalid_utf_less.csv new file mode 100644 index 000000000000..adf74fc1e5fd --- /dev/null +++ b/data/csv/rejects/multiple_errors/invalid_utf_less.csv @@ -0,0 +1,3 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boÿÿgie,3, 2023-01-01 \ No newline at end of file diff --git a/data/csv/rejects/multiple_errors/invalid_utf_max_line.csv b/data/csv/rejects/multiple_errors/invalid_utf_max_line.csv new file mode 100644 index 000000000000..1f017d2d8cf7 --- /dev/null +++ b/data/csv/rejects/multiple_errors/invalid_utf_max_line.csv @@ -0,0 +1,3 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boÿÿgieoogie boogieoogie boogieoogie boogie,3, 2023-01-01, 2, 5 \ No newline at end of file diff --git a/data/csv/rejects/multiple_errors/invalid_utf_more.csv b/data/csv/rejects/multiple_errors/invalid_utf_more.csv new file mode 100644 index 000000000000..17fa55e72875 --- /dev/null +++ b/data/csv/rejects/multiple_errors/invalid_utf_more.csv @@ -0,0 +1,3 @@ +name,age,current_day, barks +oogie boogie,3, 2023-01-01, 2 +oogie boÿÿgie,3, 2023-01-01, 2, 5 \ No newline at end of file diff --git a/data/csv/rejects/multiple_errors/invalid_utf_unquoted.csv b/data/csv/rejects/multiple_errors/invalid_utf_unquoted.csv new file mode 100644 index 000000000000..3aaedb256d05 --- /dev/null +++ b/data/csv/rejects/multiple_errors/invalid_utf_unquoted.csv @@ -0,0 +1,3 @@ +name,last_name, age,current_day, barks +oogie, boogie,3, 2023-01-01, 2 +"oogie"bla, boÿÿgie,3, 2023-01-01, 2 \ No newline at end of file diff --git a/data/csv/rejects/multiple_errors.csv b/data/csv/rejects/multiple_errors/multiple_errors.csv 
similarity index 89%
rename from data/csv/rejects/multiple_errors.csv
rename to data/csv/rejects/multiple_errors/multiple_errors.csv
index 6d10e51c3cf5..784fdd9d9faf 100644
--- a/data/csv/rejects/multiple_errors.csv
+++ b/data/csv/rejects/multiple_errors/multiple_errors.csv
@@ -3,6 +3,6 @@ oogie boogie,3, 2023-01-01, 2
oogie boogie,3, 2023-01-02, 5
oogie boogie,3, 2023-01-03, bla, 7
oogie boogie,3, bla, bla, 7
-oogie boogie,3, 2023-01-04, 8
+"oogie boogie"bla,3, 2023-01-04
oogie boogie,3, bla
oogie boogieoogie boogieoogie boogieoogie boogieoogie boogieoogie boogieoogie boogie,3, bla
diff --git a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
index acade767b83e..9f2682deadce 100644
--- a/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
+++ b/src/execution/operator/csv_scanner/scanner/string_value_scanner.cpp
@@ -334,11 +334,6 @@ void StringValueResult::AddValue(StringValueResult &result, const idx_t buffer_p
void StringValueResult::HandleUnicodeError(idx_t col_idx, LinePosition &error_position) {
    bool first_nl;
    auto borked_line = current_line_position.ReconstructCurrentLine(first_nl, buffer_handles);
-   // sanitize borked line
-   std::vector<char> char_array(borked_line.begin(), borked_line.end());
-   char_array.push_back('\0'); // Null-terminate the character array
-   Utf8Proc::MakeValid(&char_array[0], char_array.size());
-   borked_line = {char_array.begin(), char_array.end() - 1};
    LinesPerBoundary lines_per_batch(iterator.GetBoundaryIdx(), lines_read);
    if (current_line_position.begin == error_position) {
        auto csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line,
@@ -379,11 +374,6 @@ bool StringValueResult::HandleError() {
        }
        break;
    case CSVErrorType::INVALID_UNICODE: {
-       // We have to sanitize the CSV line
-       std::vector<char> char_array(borked_line.begin(), borked_line.end());
-       char_array.push_back('\0'); // Null-terminate the character array
-       Utf8Proc::MakeValid(&char_array[0], char_array.size());
-       borked_line = {char_array.begin(), char_array.end() - 1};
        if (current_line_position.begin == line_pos) {
            csv_error = CSVError::InvalidUTF8(state_machine.options, col_idx, lines_per_batch, borked_line,
@@ -489,6 +479,11 @@ string FullLinePosition::ReconstructCurrentLine(bool &first_char_nl,
            result += second_buffer[i];
        }
    }
+   // sanitize borked line
+   std::vector<char> char_array(result.begin(), result.end());
+   char_array.push_back('\0'); // Null-terminate the character array
+   Utf8Proc::MakeValid(&char_array[0], char_array.size());
+   result = {char_array.begin(), char_array.end() - 1};
    return result;
}
diff --git a/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test b/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test
index 112e7e0e2575..6d9f1fcb5baa 100644
--- a/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test
+++ b/test/sql/copy/csv/rejects/test_multiple_errors_same_line.test
@@ -257,19 +257,123 @@ DROP TABLE reject_errors;
statement ok
DROP TABLE reject_scans;

-#query IIII
-#FROM read_csv('data/csv/rejects/multiple_errors.csv',
-#    columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'},
-#    store_rejects = true, auto_detect=false, header = 1, max_line_size=40);
-#----
-#oogie boogie 3 2023-01-01 2
-#oogie boogie 3 2023-01-02 5
-#
-#query IIIIIIIIII rowsort
-#FROM reject_errors ORDER BY ALL;
-#----
-#3 0 4 89 116 4 barks CAST oogie boogie,3, 2023-01-03, bla, 7 Error when converting column "barks". Could not convert string " bla" to 'INTEGER'
-#3 0 4 89 120 5 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7 Expected Number of Columns: 4 Found: 5
-#3 0 5 124 144 4 barks CAST oogie boogie,3, bla, bla, 7 Error when converting column "barks". Could not convert string " bla" to 'INTEGER'
-#3 0 5 124 148 5 NULL TOO MANY COLUMNS oogie boogie,3, bla, bla, 7 Expected Number of Columns: 4 Found: 5
-#3 0 6 152 171 3 barks MISSING COLUMNS oogie boogie,3, bla Expected Number of Columns: 4 Found: 3
\ No newline at end of file
+query IIII
+FROM read_csv('data/csv/rejects/multiple_errors/invalid_utf_cast.csv',
+    columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'},
+    store_rejects = true, auto_detect=false, header = 1, max_line_size=40);
+----
+oogie boogie 3 2023-01-01 2
+
+query IIIIIIIII rowsort
+SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position;
+----
+0 3 59 59 1 name INVALID UNICODE oogie bo??gie,bla, 2023-01-01, 2 Invalid unicode (byte sequence mismatch) detected.
+0 3 59 73 2 age CAST oogie bo??gie,bla, 2023-01-01, 2 Error when converting column "age". Could not convert string "bla" to 'INTEGER'
+
+statement ok
+DROP TABLE reject_errors;
+
+statement ok
+DROP TABLE reject_scans;
+
+query IIII
+FROM read_csv('data/csv/rejects/multiple_errors/invalid_utf_less.csv',
+    columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'},
+    store_rejects = true, auto_detect=false, header = 1, max_line_size=40);
+----
+oogie boogie 3 2023-01-01 2
+
+
+query IIIIIIIII rowsort
+SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position;
+----
+0 3 59 59 1 name INVALID UNICODE oogie bo??gie,3, 2023-01-01 Invalid unicode (byte sequence mismatch) detected.
+0 3 59 86 3 barks MISSING COLUMNS oogie bo??gie,3, 2023-01-01 Expected Number of Columns: 4 Found: 3
+
+statement ok
+DROP TABLE reject_errors;
+
+statement ok
+DROP TABLE reject_scans;
+
+query IIII
+FROM read_csv('data/csv/rejects/multiple_errors/invalid_utf_max_line.csv',
+    columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'},
+    store_rejects = true, auto_detect=false, header = 1, max_line_size=40);
+----
+oogie boogie 3 2023-01-01 2
+
+
+query IIIIIIIII rowsort
+SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position, error_message;
+----
+0 3 59 125 5 NULL TOO MANY COLUMNS oogie bo??gieoogie boogieoogie boogieoogie boogie,3, 2023-01-01, 2, 5 Expected Number of Columns: 4 Found: 5
+0 3 59 59 1 name INVALID UNICODE oogie bo??gieoogie boogieoogie boogieoogie boogie,3, 2023-01-01, 2, 5 Invalid unicode (byte sequence mismatch) detected.
+0 3 59 59 NULL NULL LINE SIZE OVER MAXIMUM oogie bo??gieoogie boogieoogie boogieoogie boogie,3, 2023-01-01, 2, 5 Maximum line size of 40 bytes exceeded. Actual Size:70 bytes.
+
+statement ok
+DROP TABLE reject_errors;
+
+statement ok
+DROP TABLE reject_scans;
+
+query IIII
+FROM read_csv('data/csv/rejects/multiple_errors/invalid_utf_more.csv',
+    columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'},
+    store_rejects = true, auto_detect=false, header = 1, max_line_size=40);
+----
+oogie boogie 3 2023-01-01 2
+
+
+query IIIIIIIII rowsort
+SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position;
+----
+0 3 59 59 1 name INVALID UNICODE oogie bo??gie,3, 2023-01-01, 2, 5 Invalid unicode (byte sequence mismatch) detected.
+0 3 59 89 5 NULL TOO MANY COLUMNS oogie bo??gie,3, 2023-01-01, 2, 5 Expected Number of Columns: 4 Found: 5
+
+statement ok
+DROP TABLE reject_errors;
+
+statement ok
+DROP TABLE reject_scans;
+
+query IIIII
+FROM read_csv('data/csv/rejects/multiple_errors/invalid_utf_unquoted.csv',
+    columns = {'name': 'VARCHAR', 'last_name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'},
+    store_rejects = true, auto_detect=false, header = 1, max_line_size=40);
+----
+oogie boogie 3 2023-01-01 2
+
+
+query IIIIIIIII rowsort
+SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY byte_position;
+----
+0 3 71 71 1 name UNQUOTED VALUE "oogie"bla, bo??gie,3, 2023-01-01, 2 Value with unterminated quote found.
+0 3 71 82 2 last_name INVALID UNICODE "oogie"bla, bo??gie,3, 2023-01-01, 2 Invalid unicode (byte sequence mismatch) detected.
+
+statement ok
+DROP TABLE reject_errors;
+
+statement ok
+DROP TABLE reject_scans;
+
+query IIII
+FROM read_csv('data/csv/rejects/multiple_errors/multiple_errors.csv',
+    columns = {'name': 'VARCHAR', 'age': 'INTEGER', 'current_day': 'DATE', 'barks': 'INTEGER'},
+    store_rejects = true, auto_detect=false, header = 1, max_line_size=40);
+----
+oogie boogie 3 2023-01-01 2
+oogie boogie 3 2023-01-02 5
+
+query IIIIIIIII rowsort
+SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL;
+----
+0 4 89 116 4 barks CAST oogie boogie,3, 2023-01-03, bla, 7 Error when converting column "barks". Could not convert string " bla" to 'INTEGER'
+0 4 89 120 5 NULL TOO MANY COLUMNS oogie boogie,3, 2023-01-03, bla, 7 Expected Number of Columns: 4 Found: 5
+0 5 124 144 4 barks CAST oogie boogie,3, bla, bla, 7 Error when converting column "barks". Could not convert string " bla" to 'INTEGER'
+0 5 124 148 5 NULL TOO MANY COLUMNS oogie boogie,3, bla, bla, 7 Expected Number of Columns: 4 Found: 5
+0 6 152 152 1 name UNQUOTED VALUE "oogie boogie"bla,3, 2023-01-04 Value with unterminated quote found.
+0 6 152 183 3 barks MISSING COLUMNS "oogie boogie"bla,3, 2023-01-04 Expected Number of Columns: 4 Found: 3
+0 7 184 203 3 barks MISSING COLUMNS oogie boogie,3, bla Expected Number of Columns: 4 Found: 3
+0 8 204 204 NULL NULL LINE SIZE OVER MAXIMUM oogie boogieoogie boogieoogie boogieoogie boogieoogie boogieoogie boogieoogie boogie,3, bla Maximum line size of 40 bytes exceeded. Actual Size:92 bytes.
+0 8 204 295 3 barks MISSING COLUMNS oogie boogieoogie boogieoogie boogieoogie boogieoogie boogieoogie boogieoogie boogie,3, bla Expected Number of Columns: 4 Found: 3
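Patch 076 above also moves the UTF-8 sanitization of reconstructed error lines out of the two error handlers and into FullLinePosition::ReconstructCurrentLine, so every reconstructed line is sanitized exactly once. A minimal, self-contained sketch of that step follows; MakeValidToy is a simplified stand-in for duckdb's Utf8Proc::MakeValid (which repairs invalid byte sequences in place), not the real implementation:

    #include <cstddef>
    #include <string>
    #include <vector>

    // Toy stand-in for Utf8Proc::MakeValid: bluntly replaces every non-ASCII
    // byte with '?'; the real helper only rewrites invalid UTF-8 sequences.
    static void MakeValidToy(char *data, std::size_t size) {
        for (std::size_t i = 0; i < size; i++) {
            if (static_cast<unsigned char>(data[i]) >= 0x80) {
                data[i] = '?';
            }
        }
    }

    // Sanitize a reconstructed CSV line once, mirroring the logic the patch
    // consolidates at the end of ReconstructCurrentLine.
    static std::string SanitizeLine(const std::string &borked_line) {
        std::vector<char> char_array(borked_line.begin(), borked_line.end());
        char_array.push_back('\0'); // null-terminate the character array
        MakeValidToy(&char_array[0], char_array.size());
        return std::string(char_array.begin(), char_array.end() - 1);
    }

Doing this inside ReconstructCurrentLine is why the "oogie boÿÿgie" rows in the tests above come back as "oogie bo??gie" for every error type, not just the INVALID UNICODE ones.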
From 8dd5df622cf116b41d847a9c33e8cc26718b47cd Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Thu, 4 Apr 2024 17:05:26 +0200
Subject: [PATCH 077/147] woopsie on gen files

---
 .github/regression/micro_extended.csv | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/regression/micro_extended.csv b/.github/regression/micro_extended.csv
index a9517ef309b4..6973785b4c98 100644
--- a/.github/regression/micro_extended.csv
+++ b/.github/regression/micro_extended.csv
@@ -78,7 +78,6 @@ benchmark/micro/copy/to_parquet_partition_by_few.benchmark
benchmark/micro/copy/to_parquet_partition_by_many.benchmark
benchmark/micro/csv/16_byte_values.benchmark
benchmark/micro/csv/1_byte_values.benchmark
-benchmark/micro/csv/1brl.benchmark
benchmark/micro/csv/multiple_read.benchmark
benchmark/micro/csv/multiple_small_read_csv.benchmark
benchmark/micro/csv/null_padding.benchmark

From 5efa63aaaca2f7bb6cf1c1e9e22c8058ecb3a2ea Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Fri, 5 Apr 2024 10:55:38 +0200
Subject: [PATCH 078/147] make tidy happy

---
 .../execution/operator/persistent/csv_rejects_table.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp
index d00aede6687a..88dd86377dc7 100644
--- a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp
+++ b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp
@@ -1,5 +1,7 @@
#pragma once

+#include
+
#include "duckdb/storage/object_cache.hpp"
#include "duckdb/common/mutex.hpp"
#include "duckdb/common/typedefs.hpp"
@@ -15,7 +17,7 @@ class ClientContext;
class CSVRejectsTable : public ObjectCacheEntry {
public:
    CSVRejectsTable(string rejects_scan, string rejects_error)
-       : count(0), scan_table(rejects_scan), errors_table(rejects_error) {
+       : count(0), scan_table(std::move(rejects_scan)), errors_table(std::move(rejects_error)) {
    }
    mutex write_lock;
    string name;

From 9c07c9318853a65ad36ca60f7184d9b3fc5ea332 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Fri, 5 Apr 2024 11:08:07 +0200
Subject: [PATCH 079/147] Make user_parameters ordered

---
 .../operator/csv_scanner/util/csv_reader_options.cpp     | 6 +++++-
 .../execution/operator/csv_scanner/csv_reader_options.hpp | 1 +
 test/sql/copy/csv/rejects/csv_rejects_two_tables.test     | 8 ++++----
 test/sql/copy/csv/test_sniff_csv_options.test             | 2 +-
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp
index ebfe1ae306bd..6ecdc72989ae 100644
--- a/src/execution/operator/csv_scanner/util/csv_reader_options.cpp
+++ b/src/execution/operator/csv_scanner/util/csv_reader_options.cpp
@@ -386,6 +386,7 @@ bool StoreUserDefinedParameter(string &option) {
}
void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientContext &context,
                                           vector &return_types, vector &names) {
+   map<string, string> ordered_user_defined_parameters;
    for (auto &kv : in) {
        if (MultiFileReader::ParseOption(kv.first, kv.second, file_options, context)) {
            continue;
        }
        auto loption = StringUtil::Lower(kv.first);
        // skip variables that are specific to auto detection
        if (StoreUserDefinedParameter(loption)) {
-           user_defined_parameters += loption + "=" + kv.second.ToSQLString() + ", ";
+           ordered_user_defined_parameters[loption] = kv.second.ToSQLString();
        }
        if (loption == "columns") {
            auto &child_type = kv.second.type();
@@ -499,6 +500,9 @@ void CSVReaderOptions::FromNamedParameters(named_parameter_map_t &in, ClientCont
            SetReadOption(loption, kv.second, names);
        }
    }
+   for (auto &udf_parameter : ordered_user_defined_parameters) {
+       user_defined_parameters += udf_parameter.first + "=" + udf_parameter.second + ", ";
+   }
    if (user_defined_parameters.size() >= 2) {
        user_defined_parameters.erase(user_defined_parameters.size() - 2);
    }
diff --git a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
index b937ccfc11d9..53a66da77838 100644
--- a/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
+++ b/src/include/duckdb/execution/operator/csv_scanner/csv_reader_options.hpp
@@ -107,6 +107,7 @@ struct CSVReaderOptions {
    //! User defined parameters for the csv function concatenated on a string
    string user_defined_parameters;
+
    //===--------------------------------------------------------------------===//
    // WriteCSVOptions
    //===--------------------------------------------------------------------===//
diff --git a/test/sql/copy/csv/rejects/csv_rejects_two_tables.test b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test
index f50128989810..70fef75c473f 100644
--- a/test/sql/copy/csv/rejects/csv_rejects_two_tables.test
+++ b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test
@@ -19,8 +19,8 @@ BIGINT VARCHAR 11044 11044 2
query IIIIIIIIIIIII
FROM reject_scans order by all;
----
-3 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1
-3 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL store_rejects=true, sample_size=1
+3 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL sample_size=1, store_rejects=true
+3 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL sample_size=1, store_rejects=true

query IIIIIIIIII
FROM reject_errors order by all;
@@ -106,8 +106,8 @@ query IIIIIIIIIIIII
SELECT *
FROM rejects_scan_3
order by all;
----
-15 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_3', rejects_scan='rejects_scan_3', sample_size=1
-15 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_3', rejects_scan='rejects_scan_3', sample_size=1
+15 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_3', rejects_table='rejects_errors_3', sample_size=1
+15 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_3', rejects_table='rejects_errors_3', sample_size=1

query IIIIIIIIII
FROM rejects_errors_3 order by all;
diff --git a/test/sql/copy/csv/test_sniff_csv_options.test b/test/sql/copy/csv/test_sniff_csv_options.test
index 29144c0455bf..aa402aa58cb6 100644
--- a/test/sql/copy/csv/test_sniff_csv_options.test
+++ b/test/sql/copy/csv/test_sniff_csv_options.test
@@ -82,7 +82,7 @@ FROM sniff_csv('test/sql/copy/csv/data/auto/time_date_timestamp_yyyy.mm.dd.csv',
query IIIIIIIIIII
FROM sniff_csv('test/sql/copy/csv/data/auto/time_date_timestamp_yyyy.mm.dd.csv', dateformat='%Y.%m.%d', timestampformat='%Y.%m.%d %H:%M:%S')
----
-, " " \n 0 true {'a': 'BIGINT', 'b': 'VARCHAR', 't': 'TIME', 'd': 'DATE', 'ts': 'TIMESTAMP'} %Y.%m.%d %Y.%m.%d %H:%M:%S timestampformat='%Y.%m.%d %H:%M:%S', dateformat='%Y.%m.%d' FROM read_csv('test/sql/copy/csv/data/auto/time_date_timestamp_yyyy.mm.dd.csv', auto_detect=false, delim=',', quote='"', escape='"', new_line='\n', skip=0, header=true, columns={'a': 'BIGINT', 'b': 'VARCHAR', 't': 'TIME', 'd': 'DATE', 'ts': 'TIMESTAMP'}, timestampformat='%Y.%m.%d %H:%M:%S', dateformat='%Y.%m.%d');
+, " " \n 0 1 {'a': 'BIGINT', 'b': 'VARCHAR', 't': 'TIME', 'd': 'DATE', 'ts': 'TIMESTAMP'} %Y.%m.%d %Y.%m.%d %H:%M:%S dateformat='%Y.%m.%d', timestampformat='%Y.%m.%d %H:%M:%S' FROM read_csv('test/sql/copy/csv/data/auto/time_date_timestamp_yyyy.mm.dd.csv', auto_detect=false, delim=',', quote='"', escape='"', new_line='\n', skip=0, header=true, columns={'a': 'BIGINT', 'b': 'VARCHAR', 't': 'TIME', 'd': 'DATE', 'ts': 'TIMESTAMP'}, dateformat='%Y.%m.%d', timestampformat='%Y.%m.%d %H:%M:%S');

query IIIII
FROM read_csv('test/sql/copy/csv/data/auto/time_date_timestamp_yyyy.mm.dd.csv', auto_detect=false, delim=',', quote='"', escape='"', new_line='\n', skip=0, header=true, columns={'a': 'BIGINT', 'b': 'VARCHAR', 't': 'TIME', 'd': 'DATE', 'ts': 'TIMESTAMP'}, timestampformat='%Y.%m.%d %H:%M:%S', dateformat='%Y.%m.%d')
order by all
limit 1;
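Patch 079 fixes a source of nondeterminism: named parameters arrive in an unordered map, so concatenating them during iteration produced a user_defined_parameters string whose option order varied from run to run. Collecting them in a std::map first yields a stable, alphabetically ordered string, which is what the updated test expectations above encode. A small standalone sketch of the idea, with made-up option values:

    #include <iostream>
    #include <map>
    #include <string>

    // Collecting options in a std::map (ordered by key) makes the concatenated
    // parameter string deterministic, regardless of input order.
    int main() {
        std::map<std::string, std::string> ordered_user_defined_parameters;
        ordered_user_defined_parameters["store_rejects"] = "true";
        ordered_user_defined_parameters["sample_size"] = "1";

        std::string user_defined_parameters;
        for (auto &kv : ordered_user_defined_parameters) {
            user_defined_parameters += kv.first + "=" + kv.second + ", ";
        }
        if (user_defined_parameters.size() >= 2) {
            user_defined_parameters.erase(user_defined_parameters.size() - 2);
        }
        // always prints: sample_size=1, store_rejects=true
        std::cout << user_defined_parameters << std::endl;
    }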
Could not convert string "B" to 'BIGINT' +0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' statement ok DROP TABLE reject_errors; diff --git a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test index e6459aa5cd77..ba48d9fe2a99 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test +++ b/test/sql/copy/csv/rejects/csv_rejects_flush_cast.test @@ -17,9 +17,9 @@ SELECT typeof(first(a)), typeof(first(b)), COUNT(*) FROM read_csv( ---- DATE VARCHAR 2811 -query IIIIIIIIII -FROM reject_errors order by all; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -3 0 439 6997 NULL 1 a CAST B, bla Error when converting column "a". Could not parse string "B" according to format specifier "%d-%m-%Y" -3 0 2813 44972 NULL 1 a CAST c, bla Error when converting column "a". Could not parse string "c" according to format specifier "%d-%m-%Y" +0 439 6997 NULL 1 a CAST B, bla Error when converting column "a". Could not parse string "B" according to format specifier "%d-%m-%Y" +0 2813 44972 NULL 1 a CAST c, bla Error when converting column "a". Could not parse string "c" according to format specifier "%d-%m-%Y" diff --git a/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test index 99cedc820614..21ab80aacad5 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test +++ b/test/sql/copy/csv/rejects/csv_rejects_maximum_line.test @@ -13,10 +13,10 @@ SELECT * FROM read_csv( columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, store_rejects=true, auto_detect=false, header = 1, max_line_size=10); -query IIIIIIIIII -FROM reject_errors order by all; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -3 0 5 23 23 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. +0 5 23 23 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. statement ok DROP TABLE reject_errors; @@ -54,12 +54,12 @@ SELECT * FROM read_csv( columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, store_rejects = true, auto_detect=false, header = 1, max_line_size=20); -query IIIIIIIIII -FROM reject_errors order by all; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -27 0 2282 13685 13685 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. -27 0 2591 15558 15558 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. -27 0 2923 17569 17569 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. +0 2282 13685 13685 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. +0 2591 15558 15558 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. +0 2923 17569 17569 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 20 bytes exceeded. Actual Size:25 bytes. 
statement ok DROP TABLE reject_errors; @@ -74,13 +74,13 @@ SELECT * FROM read_csv( columns = {'a': 'VARCHAR', 'b': 'INTEGER'}, store_rejects = true, auto_detect=false, header = 1, max_line_size=10); -query IIIIIIIIII -FROM reject_errors order by all; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -31 0 5 23 23 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. -31 1 2282 13685 13685 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. -31 1 2591 15558 15558 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. -31 1 2923 17569 17569 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. +0 5 23 23 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaa,4 Maximum line size of 10 bytes exceeded. Actual Size:19 bytes. +1 2282 13685 13685 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. +1 2591 15558 15558 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,1 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. +1 2923 17569 17569 NULL NULL LINE SIZE OVER MAXIMUM blaaaaaaaaaaaaaaaaaaaa,3 Maximum line size of 10 bytes exceeded. Actual Size:25 bytes. statement ok DROP TABLE reject_errors; diff --git a/test/sql/copy/csv/rejects/csv_rejects_read.test b/test/sql/copy/csv/rejects/csv_rejects_read.test index b537833fd7dd..ba090366ac6e 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_read.test +++ b/test/sql/copy/csv/rejects/csv_rejects_read.test @@ -15,10 +15,10 @@ SELECT * FROM read_csv( 1 2 AAA 6 7 CCC -query IIIIIIIIII -FROM reject_errors; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors; ---- -3 0 2 10 12 2 col1 CAST 4,BBB,9, Error when converting column "col1". Could not convert string "BBB" to 'INTEGER' +0 2 10 12 2 col1 CAST 4,BBB,9, Error when converting column "col1". Could not convert string "BBB" to 'INTEGER' statement ok DROP TABLE reject_errors; @@ -35,12 +35,12 @@ SELECT * FROM read_csv( ---- 4 5 9 -query IIIIIIIIII -FROM reject_errors ORDER BY ALL; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; ---- -7 0 1 1 5 3 col2 CAST 1,2,DDD, Error when converting column "col2". Could not convert string "DDD" to 'INTEGER' -7 0 3 17 17 1 col0 CAST EEE,7,FFF, Error when converting column "col0". Could not convert string "EEE" to 'INTEGER' -7 0 3 17 23 3 col2 CAST EEE,7,FFF, Error when converting column "col2". Could not convert string "FFF" to 'INTEGER' +0 1 1 5 3 col2 CAST 1,2,DDD, Error when converting column "col2". Could not convert string "DDD" to 'INTEGER' +0 3 17 17 1 col0 CAST EEE,7,FFF, Error when converting column "col0". Could not convert string "EEE" to 'INTEGER' +0 3 17 23 3 col2 CAST EEE,7,FFF, Error when converting column "col2". Could not convert string "FFF" to 'INTEGER' statement ok DROP TABLE reject_errors; @@ -60,11 +60,11 @@ SELECT * FROM read_csv( 4 5 9 6 7 CCC -query IIIIIIIIII -FROM reject_errors ORDER BY ALL; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; ---- -11 0 2 10 12 2 col1 CAST 4,BBB,9, Error when converting column "col1". Could not convert string "BBB" to 'INTEGER' -11 1 3 17 17 1 col0 CAST EEE,7,FFF, Error when converting column "col0". 
Could not convert string "EEE" to 'INTEGER' +0 2 10 12 2 col1 CAST 4,BBB,9, Error when converting column "col1". Could not convert string "BBB" to 'INTEGER' +1 3 17 17 1 col0 CAST EEE,7,FFF, Error when converting column "col0". Could not convert string "EEE" to 'INTEGER' statement ok DROP TABLE reject_errors; @@ -106,11 +106,11 @@ SELECT SUM(num) FROM read_csv( ---- 4270 -query IIIIIIIIII -FROM reject_errors ORDER BY ALL; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; ---- -19 0 2176 10876 10876 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' -19 0 4176 20876 20876 1 num CAST C, A Error when converting column "num". Could not convert string "C" to 'INTEGER' +0 2176 10876 10876 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' +0 4176 20876 20876 1 num CAST C, A Error when converting column "num". Could not convert string "C" to 'INTEGER' statement ok DROP TABLE reject_errors; @@ -126,11 +126,11 @@ SELECT SUM(num) FROM read_csv( ---- 6774 -query IIIIIIIIII -FROM reject_errors ORDER BY ALL; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; ---- -23 0 3680 18396 18396 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' -23 0 5680 28396 28396 1 num CAST C, A Error when converting column "num". Could not convert string "C" to 'INTEGER' +0 3680 18396 18396 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' +0 5680 28396 28396 1 num CAST C, A Error when converting column "num". Could not convert string "C" to 'INTEGER' statement ok @@ -148,13 +148,13 @@ SELECT SUM(num) FROM read_csv( ---- 11044 -query IIIIIIIIII -FROM reject_errors ORDER BY ALL; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; ---- -27 0 2176 10876 10876 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' -27 0 4176 20876 20876 1 num CAST C, A Error when converting column "num". Could not convert string "C" to 'INTEGER' -27 1 3680 18396 18396 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' -27 1 5680 28396 28396 1 num CAST C, A Error when converting column "num". Could not convert string "C" to 'INTEGER' +0 2176 10876 10876 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' +0 4176 20876 20876 1 num CAST C, A Error when converting column "num". Could not convert string "C" to 'INTEGER' +1 3680 18396 18396 1 num CAST B, A Error when converting column "num". Could not convert string "B" to 'INTEGER' +1 5680 28396 28396 1 num CAST C, A Error when converting column "num". 
Could not convert string "C" to 'INTEGER' statement ok DROP TABLE reject_errors; @@ -178,20 +178,20 @@ ON L.num = R.num; 1 A 1 A 3 C 3 C -query IIIIIIIIIIIII -FROM reject_scans ORDER BY ALL; +query IIIIIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_scans ORDER BY ALL; ---- -31 0 test/sql/copy/csv/data/error/mismatch/small1.csv , " " \n 0 true {'num': 'INTEGER','str': 'VARCHAR'} NULL NULL store_rejects=true -31 1 test/sql/copy/csv/data/error/mismatch/small2.csv , " " \n 0 true {'num': 'INTEGER','str': 'VARCHAR'} NULL NULL store_rejects=true +0 test/sql/copy/csv/data/error/mismatch/small1.csv , " " \n 0 true {'num': 'INTEGER','str': 'VARCHAR'} NULL NULL store_rejects=true +1 test/sql/copy/csv/data/error/mismatch/small2.csv , " " \n 0 true {'num': 'INTEGER','str': 'VARCHAR'} NULL NULL store_rejects=true -query IIIIIIIIII -FROM reject_errors ORDER BY ALL; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; ---- -31 0 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' -31 0 6 27 27 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' -31 1 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' -31 1 5 23 23 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +0 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +0 6 27 27 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +1 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +1 5 23 23 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' statement ok DROP TABLE reject_errors; @@ -217,12 +217,12 @@ ON L.num = R.num; 3 C 3 C -query IIIIIIIIII -FROM reject_errors ORDER BY ALL; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; ---- -36 0 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' -36 0 6 27 27 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' -36 1 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +0 3 15 15 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +0 6 27 27 1 num CAST X,Y Error when converting column "num". Could not convert string "X" to 'INTEGER' +1 3 15 15 1 num CAST X,Y Error when converting column "num". 
Could not convert string "X" to 'INTEGER' statement ok diff --git a/test/sql/copy/csv/rejects/csv_rejects_two_tables.test b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test index 70fef75c473f..f856d929fa2e 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_two_tables.test +++ b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test @@ -16,19 +16,19 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M BIGINT VARCHAR 11044 11044 2 -query IIIIIIIIIIIII -FROM reject_scans order by all; +query IIIIIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_scans order by all; ---- -3 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL sample_size=1, store_rejects=true -3 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL sample_size=1, store_rejects=true +0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL sample_size=1, store_rejects=true +1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL sample_size=1, store_rejects=true -query IIIIIIIIII -FROM reject_errors order by all; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -3 0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -3 0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' -3 1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -3 1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT' # Test giving the name of errors table statement error @@ -51,19 +51,19 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M ---- BIGINT VARCHAR 11044 11044 2 -query IIIIIIIIIIIII -FROM reject_scans order by all; +query IIIIIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_scans order by all; ---- -8 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_2', sample_size=1 -8 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_2', sample_size=1 +0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_2', sample_size=1 +1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 false {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_table='rejects_errors_2', sample_size=1 -query IIIIIIIIII -FROM rejects_errors_2 order by all; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM rejects_errors_2 order by all; ---- -8 0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -8 0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' -8 1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -8 1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' statement ok drop table reject_errors; @@ -77,19 +77,19 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M ---- BIGINT VARCHAR 11044 11044 2 -query IIIIIIIIIIIII -FROM rejects_scan_2 order by all; +query IIIIIIIIIIII +SELECT * EXCLUDE (scan_id) FROM rejects_scan_2 order by all; ---- -12 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_2', sample_size=1 -12 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_2', sample_size=1 +0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_2', sample_size=1 +1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_2', sample_size=1 -query IIIIIIIIII -FROM reject_errors order by all; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -12 0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -12 0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT' -12 1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -12 1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' # Test giving the name of both tables query IIIII @@ -102,20 +102,20 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M ---- BIGINT VARCHAR 11044 11044 2 -query IIIIIIIIIIIII -SELECT * +query IIIIIIIIIIII +SELECT * EXCLUDE (scan_id) FROM rejects_scan_3 order by all; ---- -15 0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_3', rejects_table='rejects_errors_3', sample_size=1 -15 1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_3', rejects_table='rejects_errors_3', sample_size=1 +0 test/sql/copy/csv/data/error/mismatch/big_bad.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_3', rejects_table='rejects_errors_3', sample_size=1 +1 test/sql/copy/csv/data/error/mismatch/big_bad2.csv , \0 \0 \n 0 0 {'column0': 'BIGINT','column1': 'VARCHAR'} NULL NULL rejects_scan='rejects_scan_3', rejects_table='rejects_errors_3', sample_size=1 -query IIIIIIIIII -FROM rejects_errors_3 order by all; +query IIIIIIIII +SELECT * EXCLUDE (scan_id) FROM rejects_errors_3 order by all; ---- -15 0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -15 0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' -15 1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' -15 1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +0 2176 10876 10876 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +0 4176 20876 20876 1 column0 CAST C, A Error when converting column "column0". Could not convert string "C" to 'BIGINT' +1 3680 18396 18396 1 column0 CAST B, A Error when converting column "column0". Could not convert string "B" to 'BIGINT' +1 5680 28396 28396 1 column0 CAST C, A Error when converting column "column0". 
Could not convert string "C" to 'BIGINT' statement ok drop table reject_errors; diff --git a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test index e579648f8794..f6b9840b2130 100644 --- a/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test +++ b/test/sql/copy/csv/rejects/test_invalid_utf_rejects.test @@ -11,10 +11,10 @@ statement ok from read_csv('test/sql/copy/csv/data/test/invalid_utf_big.csv',columns = {'col1': 'VARCHAR','col2': 'VARCHAR','col3': 'VARCHAR'}, auto_detect=false, header = 0, delim = ',', store_rejects=true) -query IIIIIIIIII rowsort -FROM reject_errors ORDER BY ALL; +query IIIIIIIII rowsort +SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; ---- -3 0 3001 54001 54007 2 col2 INVALID UNICODE valid,invalid_??_part,valid Invalid unicode (byte sequence mismatch) detected. -3 0 3012 54209 54221 3 col3 INVALID UNICODE valid,valid,invalid_??_part Invalid unicode (byte sequence mismatch) detected. -3 0 3023 54417 54423 2 col2 INVALID UNICODE valid,invalid_??_part,valid Invalid unicode (byte sequence mismatch) detected. -3 0 3034 54625 54637 3 col3 INVALID UNICODE valid,valid,invalid_??_part Invalid unicode (byte sequence mismatch) detected. \ No newline at end of file +0 3001 54001 54007 2 col2 INVALID UNICODE valid,invalid_??_part,valid Invalid unicode (byte sequence mismatch) detected. +0 3012 54209 54221 3 col3 INVALID UNICODE valid,valid,invalid_??_part Invalid unicode (byte sequence mismatch) detected. +0 3023 54417 54423 2 col2 INVALID UNICODE valid,invalid_??_part,valid Invalid unicode (byte sequence mismatch) detected. +0 3034 54625 54637 3 col3 INVALID UNICODE valid,valid,invalid_??_part Invalid unicode (byte sequence mismatch) detected. \ No newline at end of file diff --git a/test/sql/copy/csv/rejects/test_mixed.test b/test/sql/copy/csv/rejects/test_mixed.test index d1f9b1decedc..54ef879556ab 100644 --- a/test/sql/copy/csv/rejects/test_mixed.test +++ b/test/sql/copy/csv/rejects/test_mixed.test @@ -55,12 +55,12 @@ SELECT * FROM read_csv( 1 2 pedro 1 2 pedro -query IIIIIIIIII rowsort -FROM reject_errors ORDER BY ALL; +query IIIIIIIII rowsort +SELECT * EXCLUDE (scan_id) FROM reject_errors ORDER BY ALL; ---- -3 0 10 103 106 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 3 Found: 2 -3 0 14 143 154 4 NULL TOO MANY COLUMNS 1,2,"pedro",5 Expected Number of Columns: 3 Found: 4 -3 0 19 205 207 2 b CAST 1,bla,"pedro" Error when converting column "b". Could not convert string "bla" to 'INTEGER' -3 0 22 243 247 3 c UNQUOTED VALUE 1,2,"pedro"bla Value with unterminated quote found. -3 0 32 366 366 NULL NULL LINE SIZE OVER MAXIMUM 1,2,"pedro thiago timbo holanda" Maximum line size of 20 bytes exceeded. Actual Size:33 bytes. -3 0 38 459 463 3 c INVALID UNICODE 1,2,"pedro??" Invalid unicode (byte sequence mismatch) detected. \ No newline at end of file +0 10 103 106 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 3 Found: 2 +0 14 143 154 4 NULL TOO MANY COLUMNS 1,2,"pedro",5 Expected Number of Columns: 3 Found: 4 +0 19 205 207 2 b CAST 1,bla,"pedro" Error when converting column "b". Could not convert string "bla" to 'INTEGER' +0 22 243 247 3 c UNQUOTED VALUE 1,2,"pedro"bla Value with unterminated quote found. +0 32 366 366 NULL NULL LINE SIZE OVER MAXIMUM 1,2,"pedro thiago timbo holanda" Maximum line size of 20 bytes exceeded. Actual Size:33 bytes. +0 38 459 463 3 c INVALID UNICODE 1,2,"pedro??" Invalid unicode (byte sequence mismatch) detected. 
\ No newline at end of file From 8e5ebeffbafcbb947ee973c96368f726bd21304b Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Fri, 5 Apr 2024 11:27:58 +0200 Subject: [PATCH 081/147] Letss see if our CI can take this --- test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow index 07b99f83c601..e53ef286a495 100644 --- a/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow +++ b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow @@ -13,9 +13,6 @@ copy lineitem to '__TEST_DIR__/lineitem.csv.gz'; statement ok SET temp_directory='' -# load the DB from disk (Avoids OOM when generating ze table) -load __TEST_DIR__/lineitem_compressed.db - statement ok CREATE TABLE lineitem_2(l_orderkey INTEGER NOT NULL, l_partkey INTEGER NOT NULL, l_suppkey INTEGER NOT NULL, l_linenumber INTEGER NOT NULL, l_quantity DECIMAL(15,2) NOT NULL, l_extendedprice DECIMAL(15,2) NOT NULL, l_discount DECIMAL(15,2) NOT NULL, l_tax DECIMAL(15,2) NOT NULL, l_returnflag VARCHAR NOT NULL, l_linestatus VARCHAR NOT NULL, l_shipdate DATE NOT NULL, l_commitdate DATE NOT NULL, l_receiptdate DATE NOT NULL, l_shipinstruct VARCHAR NOT NULL, l_shipmode VARCHAR NOT NULL, l_comment VARCHAR NOT NULL); From fbdf2f86b5464871f8431221bff3efa47fd55e06 Mon Sep 17 00:00:00 2001 From: Max Gabrielsson Date: Mon, 8 Apr 2024 10:16:15 +0200 Subject: [PATCH 082/147] initializes unknown indexes on catalog lookup --- .../catalog_entry/duck_schema_entry.cpp | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/catalog/catalog_entry/duck_schema_entry.cpp b/src/catalog/catalog_entry/duck_schema_entry.cpp index 105270d49b41..cbd78d374847 100644 --- a/src/catalog/catalog_entry/duck_schema_entry.cpp +++ b/src/catalog/catalog_entry/duck_schema_entry.cpp @@ -36,8 +36,8 @@ namespace duckdb { -void FindForeignKeyInformation(CatalogEntry &entry, AlterForeignKeyType alter_fk_type, - vector> &fk_arrays) { +static void FindForeignKeyInformation(CatalogEntry &entry, AlterForeignKeyType alter_fk_type, + vector> &fk_arrays) { if (entry.type != CatalogType::TABLE_ENTRY) { return; } @@ -63,6 +63,19 @@ void FindForeignKeyInformation(CatalogEntry &entry, AlterForeignKeyType alter_fk } } +static void LazyLoadIndexes(ClientContext &context, CatalogEntry &entry) { + if (entry.type == CatalogType::TABLE_ENTRY) { + auto &table_entry = entry.Cast(); + table_entry.GetStorage().info->InitializeIndexes(context); + } else if (entry.type == CatalogType::INDEX_ENTRY) { + auto &index_entry = entry.Cast(); + auto &table_entry = Catalog::GetEntry(context, CatalogType::TABLE_ENTRY, index_entry.catalog.GetName(), + index_entry.GetSchemaName(), index_entry.GetTableName()) + .Cast(); + table_entry.GetStorage().info->InitializeIndexes(context); + } +} + DuckSchemaEntry::DuckSchemaEntry(Catalog &catalog, CreateSchemaInfo &info) : SchemaCatalogEntry(catalog, info), tables(catalog, make_uniq(catalog, *this)), indexes(catalog), table_functions(catalog), copy_functions(catalog), pragma_functions(catalog), @@ -287,6 +300,9 @@ void DuckSchemaEntry::DropEntry(ClientContext &context, DropInfo &info) { CatalogTypeToString(existing_entry->type), CatalogTypeToString(info.type)); } + // if this is a index or table with indexes, initialize any unknown index instances + LazyLoadIndexes(context, *existing_entry); + // if there is a foreign key constraint, get that 
information vector> fk_arrays; FindForeignKeyInformation(*existing_entry, AlterForeignKeyType::AFT_DELETE, fk_arrays); From 57d12c7803fe2b81de8bc804f4bc38f16d54d2bd Mon Sep 17 00:00:00 2001 From: Max Gabrielsson Date: Mon, 8 Apr 2024 10:35:10 +0200 Subject: [PATCH 083/147] remove other initialization --- src/function/table/table_scan.cpp | 3 --- src/storage/data_table.cpp | 4 ---- src/storage/local_storage.cpp | 4 ---- 3 files changed, 11 deletions(-) diff --git a/src/function/table/table_scan.cpp b/src/function/table/table_scan.cpp index 097642361756..26a9843775e7 100644 --- a/src/function/table/table_scan.cpp +++ b/src/function/table/table_scan.cpp @@ -306,9 +306,6 @@ void TableScanPushdownComplexFilter(ClientContext &context, LogicalGet &get, Fun return; } - // Lazily initialize any unknown indexes that might have been loaded by an extension - storage.info->InitializeIndexes(context); - // behold storage.info->indexes.Scan([&](Index &index) { // first rewrite the index expression so the ColumnBindings align with the column bindings of the current table diff --git a/src/storage/data_table.cpp b/src/storage/data_table.cpp index ac55613c9295..0a3e0ed034f3 100644 --- a/src/storage/data_table.cpp +++ b/src/storage/data_table.cpp @@ -89,8 +89,6 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, idx_t removed_co column_definitions.emplace_back(column_def.Copy()); } - // try to initialize unknown indexes - info->InitializeIndexes(context); // first check if there are any indexes that exist that point to the removed column info->indexes.Scan([&](Index &index) { for (auto &column_id : index.column_ids) { @@ -155,8 +153,6 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, idx_t changed_id for (auto &column_def : parent.column_definitions) { column_definitions.emplace_back(column_def.Copy()); } - // try to initialize unknown indexes - info->InitializeIndexes(context); // first check if there are any indexes that exist that point to the changed column info->indexes.Scan([&](Index &index) { diff --git a/src/storage/local_storage.cpp b/src/storage/local_storage.cpp index 2c7fb0fe1d79..791847b102eb 100644 --- a/src/storage/local_storage.cpp +++ b/src/storage/local_storage.cpp @@ -470,9 +470,6 @@ void LocalStorage::Flush(DataTable &table, LocalTableStorage &storage) { storage.AppendToIndexes(transaction, append_state, append_count, true); } - // try to initialize any unknown indexes - table.info->InitializeIndexes(context); - // possibly vacuum any excess index data table.info->indexes.Scan([&](Index &index) { index.Vacuum(); @@ -575,7 +572,6 @@ TableIndexList &LocalStorage::GetIndexes(DataTable &table) { if (!storage) { throw InternalException("LocalStorage::GetIndexes - local storage not found"); } - table.info->InitializeIndexes(context); return storage->indexes; } From 3b561cb8a6821a4bc0797b4b9253ec248437e90e Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 8 Apr 2024 10:59:22 +0200 Subject: [PATCH 084/147] Remove extra catalog function --- src/catalog/catalog.cpp | 11 ----------- .../operator/persistent/csv_rejects_table.cpp | 4 ++-- src/include/duckdb/catalog/catalog.hpp | 2 -- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/src/catalog/catalog.cpp b/src/catalog/catalog.cpp index 7e865661f862..775af179588d 100644 --- a/src/catalog/catalog.cpp +++ b/src/catalog/catalog.cpp @@ -773,17 +773,6 @@ CatalogEntry &Catalog::GetEntry(ClientContext &context, const string &schema, co throw CatalogException("CatalogElement \"%s.%s\" does not exist!", 
schema, name); } -bool Catalog::EntryExists(ClientContext &context, const string &schema, const string &name) { - vector entry_types {CatalogType::TABLE_ENTRY, CatalogType::SEQUENCE_ENTRY}; - for (auto entry_type : entry_types) { - auto result = GetEntry(context, entry_type, schema, name, OnEntryNotFound::RETURN_NULL); - if (result) { - return true; - } - } - return false; -} - optional_ptr Catalog::GetEntry(ClientContext &context, CatalogType type, const string &schema_name, const string &name, OnEntryNotFound if_not_found, QueryErrorContext error_context) { diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 31f63d0279b8..4d3248bf3944 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -25,8 +25,8 @@ shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, "CSV_REJECTS_TABLE_CACHE_ENTRY_" + StringUtil::Upper(rejects_scan) + "_" + StringUtil::Upper(rejects_error); auto &cache = ObjectCache::GetObjectCache(context); auto &catalog = Catalog::GetCatalog(context, TEMP_CATALOG); - bool rejects_scan_exist = catalog.EntryExists(context, DEFAULT_SCHEMA, rejects_scan); - bool rejects_error_exist = catalog.EntryExists(context, DEFAULT_SCHEMA, rejects_error); + auto rejects_scan_exist = catalog.GetEntry(context, CatalogType::TABLE_ENTRY, DEFAULT_SCHEMA, rejects_scan, OnEntryNotFound::RETURN_NULL) != nullptr; + auto rejects_error_exist = catalog.GetEntry(context, CatalogType::TABLE_ENTRY, DEFAULT_SCHEMA, rejects_error, OnEntryNotFound::RETURN_NULL) != nullptr; if ((rejects_scan_exist || rejects_error_exist) && !cache.Get(key)) { std::ostringstream error; if (rejects_scan_exist) { diff --git a/src/include/duckdb/catalog/catalog.hpp b/src/include/duckdb/catalog/catalog.hpp index 654fe4a3569c..244a5f362037 100644 --- a/src/include/duckdb/catalog/catalog.hpp +++ b/src/include/duckdb/catalog/catalog.hpp @@ -232,8 +232,6 @@ class Catalog { //! Gets the "schema.name" entry without a specified type, if entry does not exist an exception is thrown DUCKDB_API CatalogEntry &GetEntry(ClientContext &context, const string &schema, const string &name); - //! Returns true if the "schema.name" entry without a specified type exists - DUCKDB_API bool EntryExists(ClientContext &context, const string &schema, const string &name); //! 
Fetches a logical type from the catalog
 	DUCKDB_API LogicalType GetType(ClientContext &context, const string &schema, const string &names,
 	                               OnEntryNotFound if_not_found);
 
From df3d7ff3d0c8cba66f14bc67f15dd383b081d811 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Mon, 8 Apr 2024 11:24:29 +0200
Subject: [PATCH 085/147] more pr requests, adding tests and assertions

---
 .../operator/persistent/csv_rejects_table.cpp | 12 +++++++--
 src/include/duckdb/catalog/catalog.hpp        |  1 -
 .../csv/rejects/csv_rejects_two_tables.test   | 26 ++++++++++++++++---
 third_party/utf8proc/utf8proc_wrapper.cpp     |  4 ++-
 4 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp
index 4d3248bf3944..1cd98d85fe81 100644
--- a/src/execution/operator/persistent/csv_rejects_table.cpp
+++ b/src/execution/operator/persistent/csv_rejects_table.cpp
@@ -21,12 +21,19 @@ TableCatalogEntry &CSVRejectsTable::GetScansTable(ClientContext &context) {
 
 shared_ptr<CSVRejectsTable> CSVRejectsTable::GetOrCreate(ClientContext &context, const string &rejects_scan,
                                                          const string &rejects_error) {
+	// Check that these names can't be the same
+	if (rejects_scan == rejects_error) {
+		throw BinderException("The names of the rejects scan and rejects error tables can't be the same. Use different "
+		                      "names for these tables.");
+	}
 	auto key =
 	    "CSV_REJECTS_TABLE_CACHE_ENTRY_" + StringUtil::Upper(rejects_scan) + "_" + StringUtil::Upper(rejects_error);
 	auto &cache = ObjectCache::GetObjectCache(context);
 	auto &catalog = Catalog::GetCatalog(context, TEMP_CATALOG);
-	auto rejects_scan_exist = catalog.GetEntry(context, CatalogType::TABLE_ENTRY, DEFAULT_SCHEMA, rejects_scan, OnEntryNotFound::RETURN_NULL) != nullptr;
-	auto rejects_error_exist = catalog.GetEntry(context, CatalogType::TABLE_ENTRY, DEFAULT_SCHEMA, rejects_error, OnEntryNotFound::RETURN_NULL) != nullptr;
+	auto rejects_scan_exist = catalog.GetEntry(context, CatalogType::TABLE_ENTRY, DEFAULT_SCHEMA, rejects_scan,
+	                                           OnEntryNotFound::RETURN_NULL) != nullptr;
+	auto rejects_error_exist = catalog.GetEntry(context, CatalogType::TABLE_ENTRY, DEFAULT_SCHEMA, rejects_error,
+	                                            OnEntryNotFound::RETURN_NULL) != nullptr;
 	if ((rejects_scan_exist || rejects_error_exist) && !cache.Get<CSVRejectsTable>(key)) {
 		std::ostringstream error;
 		if (rejects_scan_exist) {
@@ -38,6 +45,7 @@ shared_ptr<CSVRejectsTable> CSVRejectsTable::GetOrCreate(ClientContext &context, const string &r
 		error << "Either drop the used name(s), or give other name options in the CSV Reader function.\n";
 		throw BinderException(error.str());
 	}
+
 	return cache.GetOrCreate<CSVRejectsTable>(key, rejects_scan, rejects_error);
 }
 
diff --git a/src/include/duckdb/catalog/catalog.hpp b/src/include/duckdb/catalog/catalog.hpp
index 244a5f362037..871738a975de 100644
--- a/src/include/duckdb/catalog/catalog.hpp
+++ b/src/include/duckdb/catalog/catalog.hpp
@@ -232,7 +232,6 @@ class Catalog {
 	//! Gets the "schema.name" entry without a specified type, if entry does not exist an exception is thrown
 	DUCKDB_API CatalogEntry &GetEntry(ClientContext &context, const string &schema, const string &name);
-	//! 
Fetches a logical type from the catalog DUCKDB_API LogicalType GetType(ClientContext &context, const string &schema, const string &names, OnEntryNotFound if_not_found); diff --git a/test/sql/copy/csv/rejects/csv_rejects_two_tables.test b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test index f856d929fa2e..902031695f9f 100644 --- a/test/sql/copy/csv/rejects/csv_rejects_two_tables.test +++ b/test/sql/copy/csv/rejects/csv_rejects_two_tables.test @@ -156,8 +156,7 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M rejects_scan = 't' ); ---- -Reject Scan Table name "t" is already in use. Reject Error Table name "t" is already in use. Either drop the used name(s), or give other name options in the CSV Reader function. - +The names of the rejects scan and rejects error tables can't be the same. Use different names for these tables. # Test giving the name of the tables with store_rejects and/or ignore_errors set to false throws statement error @@ -230,4 +229,25 @@ SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), M store_rejects = false ); ---- -REJECTS_SCAN option is only supported when store_rejects is not manually set to false \ No newline at end of file +REJECTS_SCAN option is only supported when store_rejects is not manually set to false + +# Add a test where both tables have the same name (This should fail, because they both have the same name) +statement error +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + rejects_scan = 'same_name_because_why_not', + rejects_table = 'same_name_because_why_not', + store_rejects = true + ); +---- +The names of the rejects scan and rejects error tables can't be the same. Use different names for these tables. + +# This hopefully doesn't fail because the names don't get registered if they fail. 
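A note on the mechanism the tests above and below exercise: since patch 085, the reader validates the two rejects-table names and probes the temporary catalog before anything is registered in the object cache, so a failed call leaves no state behind. A minimal standalone sketch of that validate-then-get-or-create flow follows; `Catalog` and `RejectsCache` here are simplified stand-ins for DuckDB's temporary catalog and object cache, not the real interfaces:

```cpp
#include <map>
#include <memory>
#include <set>
#include <stdexcept>
#include <string>

// Hypothetical stand-ins for the temp catalog and the object cache.
using Catalog = std::set<std::string>;
struct RejectsTables {
	std::string scan_name;
	std::string error_name;
};
using RejectsCache = std::map<std::string, std::shared_ptr<RejectsTables>>;

std::shared_ptr<RejectsTables> GetOrCreate(Catalog &catalog, RejectsCache &cache,
                                           const std::string &scan, const std::string &error) {
	// Reject identical names up front, before anything is registered anywhere.
	if (scan == error) {
		throw std::invalid_argument("rejects scan and rejects error tables need distinct names");
	}
	auto key = "CSV_REJECTS_TABLE_CACHE_ENTRY_" + scan + "_" + error;
	// If either table already exists but no cache entry claims it, the name is taken.
	bool scan_exists = catalog.count(scan) != 0;
	bool error_exists = catalog.count(error) != 0;
	if ((scan_exists || error_exists) && cache.find(key) == cache.end()) {
		throw std::invalid_argument("table name already in use; drop it or pick another name");
	}
	auto &entry = cache[key];
	if (!entry) {
		// Only now does the pair of names get registered.
		entry = std::make_shared<RejectsTables>(RejectsTables {scan, error});
	}
	return entry;
}
```

Because both checks run before the cache entry is created, the final statement of the test below can reuse `same_name_because_why_not`: the failing queries above never got far enough to register it.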
+statement ok +SELECT typeof(first(column0)), typeof(first(column1)), COUNT(*), SUM(column0), MAX(len(column1)) FROM read_csv_auto( + 'test/sql/copy/csv/data/error/mismatch/big_bad*.csv', + sample_size=1, + rejects_scan = 'same_name_because_why_not', + rejects_table = 'same_name_because_why_not_2', + store_rejects = true); diff --git a/third_party/utf8proc/utf8proc_wrapper.cpp b/third_party/utf8proc/utf8proc_wrapper.cpp index 02f6c0efc5de..9ff3615e8a89 100644 --- a/third_party/utf8proc/utf8proc_wrapper.cpp +++ b/third_party/utf8proc/utf8proc_wrapper.cpp @@ -1,6 +1,6 @@ #include "utf8proc_wrapper.hpp" #include "utf8proc.hpp" - +#include "duckdb/common/assert.hpp" using namespace std; namespace duckdb { @@ -103,6 +103,7 @@ UnicodeType Utf8Proc::Analyze(const char *s, size_t len, UnicodeInvalidReason *i } void Utf8Proc::MakeValid(char *s, size_t len, char special_flag){ + D_ASSERT(special_flag <=127); UnicodeType type = UnicodeType::ASCII; for (size_t i = 0; i < len; i++) { int c = (int) s[i]; @@ -133,6 +134,7 @@ void Utf8Proc::MakeValid(char *s, size_t len, char special_flag){ type = UnicodeType::ASCII; } } + D_ASSERT(Utf8Proc::IsValid(s,len)); } char* Utf8Proc::Normalize(const char *s, size_t len) { From 7e078316e8aba6465ec2e7382ea9ce428a28aa9e Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 8 Apr 2024 12:42:09 +0200 Subject: [PATCH 086/147] Also print error_line when throwing --- src/execution/operator/csv_scanner/util/csv_error.cpp | 4 ++++ test/sql/copy/csv/csv_error_message.test | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/src/execution/operator/csv_scanner/util/csv_error.cpp b/src/execution/operator/csv_scanner/util/csv_error.cpp index da9f5d5e2435..d22738f21415 100644 --- a/src/execution/operator/csv_scanner/util/csv_error.cpp +++ b/src/execution/operator/csv_scanner/util/csv_error.cpp @@ -19,12 +19,16 @@ void CSVErrorHandler::ThrowError(CSVError csv_error) { std::ostringstream error; if (PrintLineNumber(csv_error)) { error << "CSV Error on Line: " << GetLine(csv_error.error_info) << '\n'; + if (!csv_error.csv_row.empty()) { + error << "Original Line: " << csv_error.csv_row << '\n'; + } } if (csv_error.full_error_message.empty()) { error << csv_error.error_message; } else { error << csv_error.full_error_message; } + switch (csv_error.type) { case CSVErrorType::CAST_ERROR: throw ConversionException(error.str()); diff --git a/test/sql/copy/csv/csv_error_message.test b/test/sql/copy/csv/csv_error_message.test index b41d049fe65d..b51e98dd87e9 100644 --- a/test/sql/copy/csv/csv_error_message.test +++ b/test/sql/copy/csv/csv_error_message.test @@ -26,3 +26,7 @@ SELECT * FROM read_csv('__TEST_DIR__/int_parse_error.csv', columns={'i': 'INT'}, ---- Line: 104 +statement error +SELECT * FROM read_csv('__TEST_DIR__/int_parse_error.csv', columns={'i': 'INT'}, header=False, auto_detect=false) +---- +Original Line: hello \ No newline at end of file From 2f7be72713dfaf306a62abf3d7d9374149bf2bb1 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 8 Apr 2024 12:53:21 +0200 Subject: [PATCH 087/147] Current File Index --- .../csv_scanner/table_function/global_csv_state.cpp | 2 +- src/execution/operator/persistent/csv_rejects_table.cpp | 8 ++++++++ .../execution/operator/persistent/csv_rejects_table.hpp | 9 ++++++++- src/include/duckdb/transaction/transaction_context.hpp | 2 -- src/transaction/transaction_context.cpp | 6 ------ 5 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp 
b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index b156f0af8b95..1759450ef41a 100644 --- a/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -246,7 +246,7 @@ void CSVGlobalState::FillRejectsTable() { InternalAppender scans_appender(context, scans_table); idx_t scan_idx = context.transaction.GetActiveQuery(); for (auto &file : file_scans) { - idx_t file_idx = context.transaction.GetIncrementalIndex(); + idx_t file_idx = rejects->GetCurrentFileIndex(scan_idx); auto file_name = file->file_path; auto &errors = file->error_handler->errors; // We first insert the file into the file scans table diff --git a/src/execution/operator/persistent/csv_rejects_table.cpp b/src/execution/operator/persistent/csv_rejects_table.cpp index 1cd98d85fe81..11e8c1b0edf0 100644 --- a/src/execution/operator/persistent/csv_rejects_table.cpp +++ b/src/execution/operator/persistent/csv_rejects_table.cpp @@ -19,6 +19,14 @@ TableCatalogEntry &CSVRejectsTable::GetScansTable(ClientContext &context) { return table_entry; } +idx_t CSVRejectsTable::GetCurrentFileIndex(idx_t query_id) { + if (current_query_id != query_id) { + current_query_id = query_id; + current_file_idx = 0; + } + return current_file_idx++; +} + shared_ptr CSVRejectsTable::GetOrCreate(ClientContext &context, const string &rejects_scan, const string &rejects_error) { // Check that these names can't be the same diff --git a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp index 88dd86377dc7..d1075a4fbffe 100644 --- a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp +++ b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp @@ -32,7 +32,8 @@ class CSVRejectsTable : public ObjectCacheEntry { TableCatalogEntry &GetErrorsTable(ClientContext &context); TableCatalogEntry &GetScansTable(ClientContext &context); -public: + idx_t GetCurrentFileIndex(idx_t query_id); + static string ObjectType() { return "csv_rejects_table_cache"; } @@ -40,6 +41,12 @@ class CSVRejectsTable : public ObjectCacheEntry { string GetObjectType() override { return ObjectType(); } + +private: + //! Current File Index being used in the query + idx_t current_file_idx = 0; + //! 
Current Query ID being executed + idx_t current_query_id = 0; }; } // namespace duckdb diff --git a/src/include/duckdb/transaction/transaction_context.hpp b/src/include/duckdb/transaction/transaction_context.hpp index b265c0131498..b0a50103bb46 100644 --- a/src/include/duckdb/transaction/transaction_context.hpp +++ b/src/include/duckdb/transaction/transaction_context.hpp @@ -48,7 +48,6 @@ class TransactionContext { } idx_t GetActiveQuery(); - idx_t GetIncrementalIndex(); void ResetActiveQuery(); void SetActiveQuery(transaction_t query_number); @@ -57,7 +56,6 @@ class TransactionContext { bool auto_commit; unique_ptr current_transaction; - idx_t incremental_index = 0; TransactionContext(const TransactionContext &) = delete; }; diff --git a/src/transaction/transaction_context.cpp b/src/transaction/transaction_context.cpp index 82d1fa43094f..7185a263894b 100644 --- a/src/transaction/transaction_context.cpp +++ b/src/transaction/transaction_context.cpp @@ -89,19 +89,13 @@ idx_t TransactionContext::GetActiveQuery() { return current_transaction->GetActiveQuery(); } -idx_t TransactionContext::GetIncrementalIndex() { - return incremental_index++; -} - void TransactionContext::ResetActiveQuery() { - incremental_index = 0; if (current_transaction) { SetActiveQuery(MAXIMUM_QUERY_ID); } } void TransactionContext::SetActiveQuery(transaction_t query_number) { - incremental_index = 0; if (!current_transaction) { throw InternalException("SetActiveQuery called without active transaction"); } From 88454e4ce7511ed656da38406096002a36c360f1 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 8 Apr 2024 14:57:15 +0200 Subject: [PATCH 088/147] Maybe this is fine for the serializer? --- .../duckdb/storage/serialization/nodes.json | 49 +++++++++------- src/storage/serialization/serialize_nodes.cpp | 58 ++++++++++--------- 2 files changed, 59 insertions(+), 48 deletions(-) diff --git a/src/include/duckdb/storage/serialization/nodes.json b/src/include/duckdb/storage/serialization/nodes.json index acf730100b86..034db172d9db 100644 --- a/src/include/duckdb/storage/serialization/nodes.json +++ b/src/include/duckdb/storage/serialization/nodes.json @@ -537,7 +537,8 @@ "members": [ {"id": 100, "name": "ignore_errors", - "type": "CSVOption" + "type": "CSVOption", + "default": "false" }, {"id": 101, "name": "buffer_sample_size", @@ -604,64 +605,72 @@ "type": "vector" }, {"id": 117, - "name": "store_rejects", - "type": "CSVOption" + "name": "rejects_table_name", + "type": "CSVOption", + "default": "{\"reject_errors\"}" }, {"id": 118, "name": "rejects_limit", "type": "idx_t" }, {"id": 119, + "name": "rejects_recovery_columns", + "type": "vector", + "deleted": true + }, + {"id": 120, + "name": "rejects_recovery_column_ids", + "type": "vector", + "deleted": true + }, + {"id": 121, "name": "dialect_options.state_machine_options.delimiter", "type": "CSVOption" }, - {"id": 120, + {"id": 122, "name": "dialect_options.state_machine_options.quote", "type": "CSVOption" }, - {"id": 121, + {"id": 123, "name": "dialect_options.state_machine_options.escape", "type": "CSVOption" }, - {"id": 122, + {"id": 124, "name": "dialect_options.header", "type": "CSVOption" }, - {"id": 123, + {"id": 125, "name": "dialect_options.num_cols", "type": "idx_t" }, - {"id": 124, + {"id": 126, "name": "dialect_options.state_machine_options.new_line", "type": "CSVOption" }, - {"id": 125, + {"id": 127, "name": "dialect_options.skip_rows", "type": "CSVOption" }, - {"id": 126, + {"id": 128, "name": "dialect_options.date_format", "type": "map>" }, - {"id": 
127, + {"id": 129, "name": "sniffer_user_mismatch_error", "type": "string" }, - {"id": 128, + {"id": 130, "name": "parallel", "type": "bool" }, - {"id": 129, - "name": "rejects_table_name", - "type": "CSVOption" - }, - {"id": 130, - "name": "rejects_scan_name", - "type": "CSVOption" - }, {"id": 131, "name": "was_type_manually_set", "type": "vector" + }, + {"id": 132, + "name": "rejects_scan_name", + "type": "CSVOption", + "default": "{\"reject_scans\"}" } ], "pointer_type": "none" diff --git a/src/storage/serialization/serialize_nodes.cpp b/src/storage/serialization/serialize_nodes.cpp index 2359d127ef24..a0d459a938f0 100644 --- a/src/storage/serialization/serialize_nodes.cpp +++ b/src/storage/serialization/serialize_nodes.cpp @@ -118,7 +118,7 @@ CSVOption CSVOption::Deserialize(Deserializer &deserializer) { } void CSVReaderOptions::Serialize(Serializer &serializer) const { - serializer.WriteProperty>(100, "ignore_errors", ignore_errors); + serializer.WritePropertyWithDefault>(100, "ignore_errors", ignore_errors, false); serializer.WritePropertyWithDefault(101, "buffer_sample_size", buffer_sample_size); serializer.WritePropertyWithDefault(102, "null_str", null_str); serializer.WriteProperty(103, "compression", compression); @@ -135,26 +135,27 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault(114, "buffer_size", buffer_size); serializer.WriteProperty(115, "file_options", file_options); serializer.WritePropertyWithDefault>(116, "force_quote", force_quote); - serializer.WriteProperty>(117, "store_rejects", store_rejects); + serializer.WritePropertyWithDefault>(117, "rejects_table_name", rejects_table_name, {"reject_errors"}); serializer.WritePropertyWithDefault(118, "rejects_limit", rejects_limit); - serializer.WriteProperty>(119, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter); - serializer.WriteProperty>(120, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote); - serializer.WriteProperty>(121, "dialect_options.state_machine_options.escape", dialect_options.state_machine_options.escape); - serializer.WriteProperty>(122, "dialect_options.header", dialect_options.header); - serializer.WritePropertyWithDefault(123, "dialect_options.num_cols", dialect_options.num_cols); - serializer.WriteProperty>(124, "dialect_options.state_machine_options.new_line", dialect_options.state_machine_options.new_line); - serializer.WriteProperty>(125, "dialect_options.skip_rows", dialect_options.skip_rows); - serializer.WriteProperty>>(126, "dialect_options.date_format", dialect_options.date_format); - serializer.WritePropertyWithDefault(127, "sniffer_user_mismatch_error", sniffer_user_mismatch_error); - serializer.WritePropertyWithDefault(128, "parallel", parallel); - serializer.WriteProperty>(129, "rejects_table_name", rejects_table_name); - serializer.WriteProperty>(130, "rejects_scan_name", rejects_scan_name); + /* [Deleted] (vector) "rejects_recovery_columns" */ + /* [Deleted] (vector) "rejects_recovery_column_ids" */ + serializer.WriteProperty>(121, "dialect_options.state_machine_options.delimiter", dialect_options.state_machine_options.delimiter); + serializer.WriteProperty>(122, "dialect_options.state_machine_options.quote", dialect_options.state_machine_options.quote); + serializer.WriteProperty>(123, "dialect_options.state_machine_options.escape", dialect_options.state_machine_options.escape); + serializer.WriteProperty>(124, "dialect_options.header", 
dialect_options.header); + serializer.WritePropertyWithDefault(125, "dialect_options.num_cols", dialect_options.num_cols); + serializer.WriteProperty>(126, "dialect_options.state_machine_options.new_line", dialect_options.state_machine_options.new_line); + serializer.WriteProperty>(127, "dialect_options.skip_rows", dialect_options.skip_rows); + serializer.WriteProperty>>(128, "dialect_options.date_format", dialect_options.date_format); + serializer.WritePropertyWithDefault(129, "sniffer_user_mismatch_error", sniffer_user_mismatch_error); + serializer.WritePropertyWithDefault(130, "parallel", parallel); serializer.WritePropertyWithDefault>(131, "was_type_manually_set", was_type_manually_set); + serializer.WritePropertyWithDefault>(132, "rejects_scan_name", rejects_scan_name, {"reject_scans"}); } CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { CSVReaderOptions result; - deserializer.ReadProperty>(100, "ignore_errors", result.ignore_errors); + deserializer.ReadPropertyWithDefault>(100, "ignore_errors", result.ignore_errors, false); deserializer.ReadPropertyWithDefault(101, "buffer_sample_size", result.buffer_sample_size); deserializer.ReadPropertyWithDefault(102, "null_str", result.null_str); deserializer.ReadProperty(103, "compression", result.compression); @@ -171,21 +172,22 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { deserializer.ReadPropertyWithDefault(114, "buffer_size", result.buffer_size); deserializer.ReadProperty(115, "file_options", result.file_options); deserializer.ReadPropertyWithDefault>(116, "force_quote", result.force_quote); - deserializer.ReadProperty>(117, "store_rejects", result.store_rejects); + deserializer.ReadPropertyWithDefault>(117, "rejects_table_name", result.rejects_table_name, {"reject_errors"}); deserializer.ReadPropertyWithDefault(118, "rejects_limit", result.rejects_limit); - deserializer.ReadProperty>(119, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter); - deserializer.ReadProperty>(120, "dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote); - deserializer.ReadProperty>(121, "dialect_options.state_machine_options.escape", result.dialect_options.state_machine_options.escape); - deserializer.ReadProperty>(122, "dialect_options.header", result.dialect_options.header); - deserializer.ReadPropertyWithDefault(123, "dialect_options.num_cols", result.dialect_options.num_cols); - deserializer.ReadProperty>(124, "dialect_options.state_machine_options.new_line", result.dialect_options.state_machine_options.new_line); - deserializer.ReadProperty>(125, "dialect_options.skip_rows", result.dialect_options.skip_rows); - deserializer.ReadProperty>>(126, "dialect_options.date_format", result.dialect_options.date_format); - deserializer.ReadPropertyWithDefault(127, "sniffer_user_mismatch_error", result.sniffer_user_mismatch_error); - deserializer.ReadPropertyWithDefault(128, "parallel", result.parallel); - deserializer.ReadProperty>(129, "rejects_table_name", result.rejects_table_name); - deserializer.ReadProperty>(130, "rejects_scan_name", result.rejects_scan_name); + deserializer.ReadDeletedProperty>(119, "rejects_recovery_columns"); + deserializer.ReadDeletedProperty>(120, "rejects_recovery_column_ids"); + deserializer.ReadProperty>(121, "dialect_options.state_machine_options.delimiter", result.dialect_options.state_machine_options.delimiter); + deserializer.ReadProperty>(122, 
"dialect_options.state_machine_options.quote", result.dialect_options.state_machine_options.quote); + deserializer.ReadProperty>(123, "dialect_options.state_machine_options.escape", result.dialect_options.state_machine_options.escape); + deserializer.ReadProperty>(124, "dialect_options.header", result.dialect_options.header); + deserializer.ReadPropertyWithDefault(125, "dialect_options.num_cols", result.dialect_options.num_cols); + deserializer.ReadProperty>(126, "dialect_options.state_machine_options.new_line", result.dialect_options.state_machine_options.new_line); + deserializer.ReadProperty>(127, "dialect_options.skip_rows", result.dialect_options.skip_rows); + deserializer.ReadProperty>>(128, "dialect_options.date_format", result.dialect_options.date_format); + deserializer.ReadPropertyWithDefault(129, "sniffer_user_mismatch_error", result.sniffer_user_mismatch_error); + deserializer.ReadPropertyWithDefault(130, "parallel", result.parallel); deserializer.ReadPropertyWithDefault>(131, "was_type_manually_set", result.was_type_manually_set); + deserializer.ReadPropertyWithDefault>(132, "rejects_scan_name", result.rejects_scan_name, {"reject_scans"}); return result; } From 2fb9648b2f33226a4d1254fa4a6da326e9f76030 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 8 Apr 2024 15:03:23 +0200 Subject: [PATCH 089/147] Go away utility --- .../duckdb/execution/operator/persistent/csv_rejects_table.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp index d1075a4fbffe..7b88e0de80cb 100644 --- a/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp +++ b/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp @@ -1,7 +1,5 @@ #pragma once -#include - #include "duckdb/storage/object_cache.hpp" #include "duckdb/common/mutex.hpp" #include "duckdb/common/typedefs.hpp" From feea4ce0fc4f7ed0ad3fd1d97dff415e1fe19650 Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 8 Apr 2024 15:08:29 +0200 Subject: [PATCH 090/147] Update this test for smaller SF --- .../test_multiple_big_compressed_csvs.test_slow | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow index e53ef286a495..4c8116b7d3cd 100644 --- a/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow +++ b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow @@ -5,19 +5,22 @@ require tpch statement ok -CALL dbgen(sf=10); +CALL dbgen(sf=1); + +statement ok +SET temp_directory='' statement ok copy lineitem to '__TEST_DIR__/lineitem.csv.gz'; statement ok -SET temp_directory='' +DROP TABLE lineitem; statement ok -CREATE TABLE lineitem_2(l_orderkey INTEGER NOT NULL, l_partkey INTEGER NOT NULL, l_suppkey INTEGER NOT NULL, l_linenumber INTEGER NOT NULL, l_quantity DECIMAL(15,2) NOT NULL, l_extendedprice DECIMAL(15,2) NOT NULL, l_discount DECIMAL(15,2) NOT NULL, l_tax DECIMAL(15,2) NOT NULL, l_returnflag VARCHAR NOT NULL, l_linestatus VARCHAR NOT NULL, l_shipdate DATE NOT NULL, l_commitdate DATE NOT NULL, l_receiptdate DATE NOT NULL, l_shipinstruct VARCHAR NOT NULL, l_shipmode VARCHAR NOT NULL, l_comment VARCHAR NOT NULL); +CREATE TABLE lineitem(l_orderkey INTEGER NOT NULL, l_partkey INTEGER NOT NULL, l_suppkey INTEGER NOT NULL, l_linenumber INTEGER NOT NULL, l_quantity DECIMAL(15,2) NOT NULL, l_extendedprice 
DECIMAL(15,2) NOT NULL, l_discount DECIMAL(15,2) NOT NULL, l_tax DECIMAL(15,2) NOT NULL, l_returnflag VARCHAR NOT NULL, l_linestatus VARCHAR NOT NULL, l_shipdate DATE NOT NULL, l_commitdate DATE NOT NULL, l_receiptdate DATE NOT NULL, l_shipinstruct VARCHAR NOT NULL, l_shipmode VARCHAR NOT NULL, l_comment VARCHAR NOT NULL); statement ok -INSERT INTO lineitem_2 FROM read_csv([ +INSERT INTO lineitem FROM read_csv([ '__TEST_DIR__/lineitem.csv.gz', '__TEST_DIR__/lineitem.csv.gz', '__TEST_DIR__/lineitem.csv.gz', @@ -45,6 +48,6 @@ INSERT INTO lineitem_2 FROM read_csv([ ]); query I -select count(*) from lineitem_2 +select count(*) from lineitem ---- -1439665248 \ No newline at end of file +144029160 \ No newline at end of file From 3870eda4a057b20a35e8addf19c0a2f70a407b3d Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Mon, 8 Apr 2024 16:51:46 +0200 Subject: [PATCH 091/147] some special code for options --- .../duckdb/common/serializer/deserializer.hpp | 12 ++++++++++++ .../duckdb/common/serializer/serializer.hpp | 16 ++++++++++++++++ .../duckdb/storage/serialization/nodes.json | 6 +++--- src/storage/serialization/serialize_nodes.cpp | 8 ++++---- 4 files changed, 35 insertions(+), 7 deletions(-) diff --git a/src/include/duckdb/common/serializer/deserializer.hpp b/src/include/duckdb/common/serializer/deserializer.hpp index 000104c43c60..e4096878a810 100644 --- a/src/include/duckdb/common/serializer/deserializer.hpp +++ b/src/include/duckdb/common/serializer/deserializer.hpp @@ -114,6 +114,18 @@ class Deserializer { OnOptionalPropertyEnd(true); } + template + inline void ReadPropertyWithDefault(const field_id_t field_id, const char *tag, CSVOption &ret, + T &&default_value) { + if (!OnOptionalPropertyBegin(field_id, tag)) { + ret = std::forward(default_value); + OnOptionalPropertyEnd(false); + return; + } + ret = Read(); + OnOptionalPropertyEnd(true); + } + // Special case: // Read into an existing data_ptr_t inline void ReadProperty(const field_id_t field_id, const char *tag, data_ptr_t ret, idx_t count) { diff --git a/src/include/duckdb/common/serializer/serializer.hpp b/src/include/duckdb/common/serializer/serializer.hpp index 10b926d048e1..f791b4a892df 100644 --- a/src/include/duckdb/common/serializer/serializer.hpp +++ b/src/include/duckdb/common/serializer/serializer.hpp @@ -17,6 +17,7 @@ #include "duckdb/common/unordered_set.hpp" #include "duckdb/common/optional_idx.hpp" #include "duckdb/common/value_operations/value_operations.hpp" +#include "duckdb/execution/operator/csv_scanner/csv_option.hpp" namespace duckdb { @@ -86,6 +87,21 @@ class Serializer { OnOptionalPropertyEnd(true); } + // Specialization for Value (default Value comparison throws when comparing nulls) + template + void WritePropertyWithDefault(const field_id_t field_id, const char *tag, const CSVOption &value, + const T &&default_value) { + // If current value is default, don't write it + if (!serialize_default_values && (value == default_value)) { + OnOptionalPropertyBegin(field_id, tag, false); + OnOptionalPropertyEnd(false); + return; + } + OnOptionalPropertyBegin(field_id, tag, true); + WriteValue(value.GetValue()); + OnOptionalPropertyEnd(true); + } + // Special case: data_ptr_T void WriteProperty(const field_id_t field_id, const char *tag, const_data_ptr_t ptr, idx_t count) { OnPropertyBegin(field_id, tag); diff --git a/src/include/duckdb/storage/serialization/nodes.json b/src/include/duckdb/storage/serialization/nodes.json index 034db172d9db..547be7329ad8 100644 --- 
a/src/include/duckdb/storage/serialization/nodes.json +++ b/src/include/duckdb/storage/serialization/nodes.json @@ -537,7 +537,7 @@ "members": [ {"id": 100, "name": "ignore_errors", - "type": "CSVOption", + "type": "bool", "default": "false" }, {"id": 101, @@ -606,8 +606,8 @@ }, {"id": 117, "name": "rejects_table_name", - "type": "CSVOption", - "default": "{\"reject_errors\"}" + "type": "string", + "default": "\"reject_errors\"" }, {"id": 118, "name": "rejects_limit", diff --git a/src/storage/serialization/serialize_nodes.cpp b/src/storage/serialization/serialize_nodes.cpp index a0d459a938f0..39529e35e786 100644 --- a/src/storage/serialization/serialize_nodes.cpp +++ b/src/storage/serialization/serialize_nodes.cpp @@ -118,7 +118,7 @@ CSVOption CSVOption::Deserialize(Deserializer &deserializer) { } void CSVReaderOptions::Serialize(Serializer &serializer) const { - serializer.WritePropertyWithDefault>(100, "ignore_errors", ignore_errors, false); + serializer.WritePropertyWithDefault(100, "ignore_errors", ignore_errors, false); serializer.WritePropertyWithDefault(101, "buffer_sample_size", buffer_sample_size); serializer.WritePropertyWithDefault(102, "null_str", null_str); serializer.WriteProperty(103, "compression", compression); @@ -135,7 +135,7 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { serializer.WritePropertyWithDefault(114, "buffer_size", buffer_size); serializer.WriteProperty(115, "file_options", file_options); serializer.WritePropertyWithDefault>(116, "force_quote", force_quote); - serializer.WritePropertyWithDefault>(117, "rejects_table_name", rejects_table_name, {"reject_errors"}); + serializer.WritePropertyWithDefault(117, "rejects_table_name", rejects_table_name, "reject_errors"); serializer.WritePropertyWithDefault(118, "rejects_limit", rejects_limit); /* [Deleted] (vector) "rejects_recovery_columns" */ /* [Deleted] (vector) "rejects_recovery_column_ids" */ @@ -155,7 +155,7 @@ void CSVReaderOptions::Serialize(Serializer &serializer) const { CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { CSVReaderOptions result; - deserializer.ReadPropertyWithDefault>(100, "ignore_errors", result.ignore_errors, false); + deserializer.ReadPropertyWithDefault(100, "ignore_errors", result.ignore_errors, false); deserializer.ReadPropertyWithDefault(101, "buffer_sample_size", result.buffer_sample_size); deserializer.ReadPropertyWithDefault(102, "null_str", result.null_str); deserializer.ReadProperty(103, "compression", result.compression); @@ -172,7 +172,7 @@ CSVReaderOptions CSVReaderOptions::Deserialize(Deserializer &deserializer) { deserializer.ReadPropertyWithDefault(114, "buffer_size", result.buffer_size); deserializer.ReadProperty(115, "file_options", result.file_options); deserializer.ReadPropertyWithDefault>(116, "force_quote", result.force_quote); - deserializer.ReadPropertyWithDefault>(117, "rejects_table_name", result.rejects_table_name, {"reject_errors"}); + deserializer.ReadPropertyWithDefault(117, "rejects_table_name", result.rejects_table_name, "reject_errors"); deserializer.ReadPropertyWithDefault(118, "rejects_limit", result.rejects_limit); deserializer.ReadDeletedProperty>(119, "rejects_recovery_columns"); deserializer.ReadDeletedProperty>(120, "rejects_recovery_column_ids"); From 41cd77ae336fb61aae34947116862f17144d1c0d Mon Sep 17 00:00:00 2001 From: Pedro Holanda Date: Tue, 9 Apr 2024 12:57:07 +0200 Subject: [PATCH 092/147] One more test still having scan_ids --- .../csv_incorrect_columns_amount_rejects.test | 84 
+++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test index 2b59e17547d3..f1b63112a4c4 100644 --- a/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test +++ b/test/sql/copy/csv/rejects/csv_incorrect_columns_amount_rejects.test @@ -13,13 +13,13 @@ SELECT * FROM read_csv( columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'}, store_rejects=true, auto_detect=false, header = 1); -query IIIIIIIIII rowsort -FROM reject_errors order by all; +query IIIIIIIII rowsort +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -3 0 1814 14505 14510 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 -3 0 1823 14575 14576 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 -3 0 2378 19009 19010 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 -3 0 2762 22075 22078 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 +0 1814 14505 14510 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 +0 1823 14575 14576 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +0 2378 19009 19010 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +0 2762 22075 22078 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 statement ok DROP TABLE reject_errors; @@ -33,15 +33,15 @@ SELECT * FROM read_csv( columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'}, store_rejects=true, auto_detect=false, header = 1); -query IIIIIIIIII rowsort -FROM reject_errors order by all; +query IIIIIIIII rowsort +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -7 0 1096 8761 8768 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 -7 0 1096 8761 8770 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 -7 0 1159 9269 9276 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 -7 0 1159 9269 9278 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 -7 0 1206 9649 9656 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 -7 0 2769 22155 22162 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 +0 1096 8761 8768 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +0 1096 8761 8770 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +0 1159 9269 9276 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +0 1159 9269 9278 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +0 1206 9649 9656 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 +0 2769 22155 22162 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 statement ok DROP TABLE reject_errors; @@ -55,15 +55,15 @@ SELECT * FROM read_csv( columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'}, store_rejects=true, auto_detect=false, header = 1); -query IIIIIIIIII rowsort -FROM reject_errors order by all; +query IIIIIIIII rowsort +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all; ---- -11 0 1604 12825 12826 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 -11 0 1671 13355 13362 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 -11 0 1671 13355 13364 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 -11 0 2751 21999 22002 2 c 
MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 -11 0 2768 22131 22138 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 -11 0 2768 22131 22140 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +0 1604 12825 12826 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +0 1671 13355 13362 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +0 1671 13355 13364 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +0 2751 21999 22002 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 +0 2768 22131 22138 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +0 2768 22131 22140 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 # Different Buffer Sizes @@ -102,24 +102,24 @@ SELECT * FROM read_csv( columns = {'a': 'INTEGER', 'b': 'INTEGER', 'c': 'INTEGER', 'd': 'INTEGER'}, store_rejects=true, auto_detect=false, header = 1); -query IIIIIIIIII rowsort -FROM reject_errors order by all +query IIIIIIIII rowsort +SELECT * EXCLUDE (scan_id) FROM reject_errors order by all ---- -35 0 1814 14505 14510 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 -35 0 1823 14575 14576 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 -35 0 2378 19009 19010 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 -35 0 2762 22075 22078 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 -35 1 1096 8761 8768 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 -35 1 1096 8761 8770 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 -35 1 1159 9269 9276 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 -35 1 1159 9269 9278 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 -35 1 1206 9649 9656 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 -35 1 2769 22155 22162 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 -35 2 1604 12825 12826 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 -35 2 1671 13355 13362 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 -35 2 1671 13355 13364 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 -35 2 2751 21999 22002 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 -35 2 2768 22131 22138 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 -35 2 2768 22131 22140 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 -35 3 3 17 24 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 -35 3 4 27 32 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 +0 1814 14505 14510 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 +0 1823 14575 14576 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +0 2378 19009 19010 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +0 2762 22075 22078 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 +1 1096 8761 8768 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +1 1096 8761 8770 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +1 1159 9269 9276 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +1 1159 9269 9278 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +1 1206 9649 9656 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected 
Number of Columns: 4 Found: 5 +1 2769 22155 22162 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 +2 1604 12825 12826 1 b MISSING COLUMNS 1 Expected Number of Columns: 4 Found: 1 +2 1671 13355 13362 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +2 1671 13355 13364 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +2 2751 21999 22002 2 c MISSING COLUMNS 1,2 Expected Number of Columns: 4 Found: 2 +2 2768 22131 22138 5 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 5 +2 2768 22131 22140 6 NULL TOO MANY COLUMNS 1,2,3,4,5,6 Expected Number of Columns: 4 Found: 6 +3 3 17 24 5 NULL TOO MANY COLUMNS 1,2,3,4,5 Expected Number of Columns: 4 Found: 5 +3 4 27 32 3 d MISSING COLUMNS 1,2,3 Expected Number of Columns: 4 Found: 3 From 437f8de28428234f864571d4db66b93d103a9ba9 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 9 Apr 2024 22:05:46 +0200 Subject: [PATCH 093/147] Respect read-only mode in dbgen and dsdgen --- extension/tpcds/tpcds_extension.cpp | 10 +++++-- extension/tpch/tpch_extension.cpp | 12 +++++--- .../duckdb/function/table_function.hpp | 5 ++-- .../binder/tableref/bind_table_function.cpp | 2 +- src/planner/operator/logical_get.cpp | 2 +- test/sql/tpcds/dsdgen_readonly.test | 30 +++++++++++++++++++ test/sql/tpch/dbgen_readonly.test | 30 +++++++++++++++++++ 7 files changed, 80 insertions(+), 11 deletions(-) create mode 100644 test/sql/tpcds/dsdgen_readonly.test create mode 100644 test/sql/tpch/dbgen_readonly.test diff --git a/extension/tpcds/tpcds_extension.cpp b/extension/tpcds/tpcds_extension.cpp index c6157b67ae93..44c92d91333c 100644 --- a/extension/tpcds/tpcds_extension.cpp +++ b/extension/tpcds/tpcds_extension.cpp @@ -43,13 +43,17 @@ static duckdb::unique_ptr DsdgenBind(ClientContext &context, Table result->keys = kv.second.GetValue(); } } + if (input.binder) { + auto &catalog = Catalog::GetCatalog(context, result->catalog); + input.binder->properties.modified_databases.insert(catalog.GetName()); + } return_types.emplace_back(LogicalType::BOOLEAN); names.emplace_back("Success"); return std::move(result); } static void DsdgenFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { - auto &data = (DSDGenFunctionData &)*data_p.bind_data; + auto &data = data_p.bind_data->CastNoConst(); if (data.finished) { return; } @@ -82,7 +86,7 @@ static duckdb::unique_ptr TPCDSQueryBind(ClientContext &context, T } static void TPCDSQueryFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { - auto &data = (TPCDSData &)*data_p.global_state; + auto &data = data_p.global_state->Cast(); idx_t tpcds_queries = tpcds::DSDGenWrapper::QueriesCount(); if (data.offset >= tpcds_queries) { // finished returning values @@ -116,7 +120,7 @@ static duckdb::unique_ptr TPCDSQueryAnswerBind(ClientContext &cont } static void TPCDSQueryAnswerFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { - auto &data = (TPCDSData &)*data_p.global_state; + auto &data = data_p.global_state->Cast(); idx_t tpcds_queries = tpcds::DSDGenWrapper::QueriesCount(); vector scale_factors {1, 10}; idx_t total_answers = tpcds_queries * scale_factors.size(); diff --git a/extension/tpch/tpch_extension.cpp b/extension/tpch/tpch_extension.cpp index 68548438ab20..e1d6016b0681 100644 --- a/extension/tpch/tpch_extension.cpp +++ b/extension/tpch/tpch_extension.cpp @@ -8,6 +8,7 @@ #include "duckdb/parser/parser.hpp" #include 
"duckdb/parser/statement/select_statement.hpp" #include "duckdb/main/extension_util.hpp" +#include "duckdb/transaction/transaction.hpp" #endif #include "dbgen/dbgen.hpp" @@ -51,14 +52,17 @@ static duckdb::unique_ptr DbgenBind(ClientContext &context, TableF if (result->children != 1 && result->step == -1) { throw InvalidInputException("Step must be defined when children are defined"); } - + if (input.binder) { + auto &catalog = Catalog::GetCatalog(context, result->catalog); + input.binder->properties.modified_databases.insert(catalog.GetName()); + } return_types.emplace_back(LogicalType::BOOLEAN); names.emplace_back("Success"); return std::move(result); } static void DbgenFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { - auto &data = (DBGenFunctionData &)*data_p.bind_data; + auto &data = data_p.bind_data->CastNoConst(); if (data.finished) { return; } @@ -92,7 +96,7 @@ static duckdb::unique_ptr TPCHQueryBind(ClientContext &context, Ta } static void TPCHQueryFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { - auto &data = (TPCHData &)*data_p.global_state; + auto &data = data_p.global_state->Cast(); idx_t tpch_queries = 22; if (data.offset >= tpch_queries) { // finished returning values @@ -126,7 +130,7 @@ static duckdb::unique_ptr TPCHQueryAnswerBind(ClientContext &conte } static void TPCHQueryAnswerFunction(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { - auto &data = (TPCHData &)*data_p.global_state; + auto &data = data_p.global_state->Cast(); idx_t tpch_queries = 22; vector scale_factors {0.01, 0.1, 1}; idx_t total_answers = tpch_queries * scale_factors.size(); diff --git a/src/include/duckdb/function/table_function.hpp b/src/include/duckdb/function/table_function.hpp index 30cc0d9f8df7..3321d274f902 100644 --- a/src/include/duckdb/function/table_function.hpp +++ b/src/include/duckdb/function/table_function.hpp @@ -83,9 +83,9 @@ struct LocalTableFunctionState { struct TableFunctionBindInput { TableFunctionBindInput(vector &inputs, named_parameter_map_t &named_parameters, vector &input_table_types, vector &input_table_names, - optional_ptr info) + optional_ptr info, optional_ptr binder) : inputs(inputs), named_parameters(named_parameters), input_table_types(input_table_types), - input_table_names(input_table_names), info(info) { + input_table_names(input_table_names), info(info), binder(binder) { } vector &inputs; @@ -93,6 +93,7 @@ struct TableFunctionBindInput { vector &input_table_types; vector &input_table_names; optional_ptr info; + optional_ptr binder; }; struct TableFunctionInitInput { diff --git a/src/planner/binder/tableref/bind_table_function.cpp b/src/planner/binder/tableref/bind_table_function.cpp index 16a60ce1cd19..be9848683b54 100644 --- a/src/planner/binder/tableref/bind_table_function.cpp +++ b/src/planner/binder/tableref/bind_table_function.cpp @@ -140,7 +140,7 @@ Binder::BindTableFunctionInternal(TableFunction &table_function, const string &f vector return_names; if (table_function.bind || table_function.bind_replace) { TableFunctionBindInput bind_input(parameters, named_parameters, input_table_types, input_table_names, - table_function.function_info.get()); + table_function.function_info.get(), this); if (table_function.bind_replace) { auto new_plan = table_function.bind_replace(context, bind_input); if (new_plan != nullptr) { diff --git a/src/planner/operator/logical_get.cpp b/src/planner/operator/logical_get.cpp index 1ee9256189ed..077eb6cbbaa9 100644 --- 
a/src/planner/operator/logical_get.cpp +++ b/src/planner/operator/logical_get.cpp @@ -165,7 +165,7 @@ unique_ptr LogicalGet::Deserialize(Deserializer &deserializer) deserializer.ReadProperty(208, "input_table_types", result->input_table_types); deserializer.ReadProperty(209, "input_table_names", result->input_table_names); TableFunctionBindInput input(result->parameters, result->named_parameters, result->input_table_types, - result->input_table_names, function.function_info.get()); + result->input_table_names, function.function_info.get(), nullptr); vector bind_return_types; vector bind_names; diff --git a/test/sql/tpcds/dsdgen_readonly.test b/test/sql/tpcds/dsdgen_readonly.test new file mode 100644 index 000000000000..a037cade63ec --- /dev/null +++ b/test/sql/tpcds/dsdgen_readonly.test @@ -0,0 +1,30 @@ +# name: test/sql/tpcds/dsdgen_readonly.test +# description: Test that dsdgen respects read-only mode +# group: [tpcds] + +require tpcds + +load __TEST_DIR__/test_dsdgen_ro.db + +statement ok +CREATE TABLE tbl (i INTEGER); + +load __TEST_DIR__/test_dsdgen_ro.db readonly + +statement error +CALL dsdgen(sf=0); +---- +read-only + +load + +statement ok +ATTACH '__TEST_DIR__/test_dsdgen_ro.db' AS dsdgentest (READ_ONLY) + +statement error +CALL dsdgen(sf=0, catalog='dsdgentest'); +---- +read-only + +statement ok +CALL dsdgen(sf=0); diff --git a/test/sql/tpch/dbgen_readonly.test b/test/sql/tpch/dbgen_readonly.test new file mode 100644 index 000000000000..de5ecc8d3377 --- /dev/null +++ b/test/sql/tpch/dbgen_readonly.test @@ -0,0 +1,30 @@ +# name: test/sql/tpch/dbgen_readonly.test +# description: Test that dbgen respects read-only mode +# group: [tpch] + +require tpch + +load __TEST_DIR__/test_dbgen_ro.db + +statement ok +CREATE TABLE tbl (i INTEGER); + +load __TEST_DIR__/test_dbgen_ro.db readonly + +statement error +CALL dbgen(sf=0); +---- +read-only + +load + +statement ok +ATTACH '__TEST_DIR__/test_dbgen_ro.db' AS dbgentest (READ_ONLY) + +statement error +CALL dbgen(sf=0, catalog='dbgentest'); +---- +read-only + +statement ok +CALL dbgen(sf=0); From c4a109f97e017e9ba6733aceafca7c803d31bd62 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 9 Apr 2024 22:22:43 +0200 Subject: [PATCH 094/147] Correctly handle quoted database names in USE statement --- src/catalog/catalog_search_path.cpp | 5 +++ .../transform/statement/transform_use.cpp | 5 ++- test/sql/attach/attach_dbname_quotes.test | 42 +++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 test/sql/attach/attach_dbname_quotes.test diff --git a/src/catalog/catalog_search_path.cpp b/src/catalog/catalog_search_path.cpp index 6be5f491c0f0..9368e300f4ad 100644 --- a/src/catalog/catalog_search_path.cpp +++ b/src/catalog/catalog_search_path.cpp @@ -67,6 +67,11 @@ CatalogSearchEntry CatalogSearchEntry::ParseInternal(const string &input, idx_t if (input[idx] == '"') { //! 
unquote idx++; + if (idx < input.size() && input[idx] == '"') { + // escaped quote + entry += input[idx]; + continue; + } goto normal; } entry += input[idx]; diff --git a/src/parser/transform/statement/transform_use.cpp b/src/parser/transform/statement/transform_use.cpp index a4d76843725f..4bb8a2b86414 100644 --- a/src/parser/transform/statement/transform_use.cpp +++ b/src/parser/transform/statement/transform_use.cpp @@ -11,9 +11,10 @@ unique_ptr Transformer::TransformUse(duckdb_libpgquery::PGUseStmt } string name; if (IsInvalidSchema(qualified_name.schema)) { - name = qualified_name.name; + name = KeywordHelper::WriteOptionallyQuoted(qualified_name.name, '"'); } else { - name = qualified_name.schema + "." + qualified_name.name; + name = KeywordHelper::WriteOptionallyQuoted(qualified_name.schema, '"') + "." + + KeywordHelper::WriteOptionallyQuoted(qualified_name.name, '"'); } auto name_expr = make_uniq(Value(name)); return make_uniq("schema", std::move(name_expr), SetScope::AUTOMATIC); diff --git a/test/sql/attach/attach_dbname_quotes.test b/test/sql/attach/attach_dbname_quotes.test new file mode 100644 index 000000000000..6c1069dd95c6 --- /dev/null +++ b/test/sql/attach/attach_dbname_quotes.test @@ -0,0 +1,42 @@ +# name: test/sql/attach/attach_dbname_quotes.test +# description: Test ATTACH with a quoted database name +# group: [attach] + +require noforcestorage + +statement ok +ATTACH ':memory:' as "my""db"; + +statement ok +CREATE TABLE "my""db".tbl(i int); + +statement ok +INSERT INTO "my""db".tbl VALUES (42) + +statement ok +USE "my""db"; + +query I +SELECT * FROM tbl +---- +42 + +statement ok +USE memory + +statement ok +CREATE SCHEMA "my""db"."my""schema" + +statement ok +CREATE TABLE "my""db"."my""schema".tbl(i int); + +statement ok +INSERT INTO "my""db"."my""schema".tbl VALUES (84) + +statement ok +USE "my""db"."my""schema" + +query I +SELECT * FROM tbl +---- +84 From 1ce844a7e7a9da99e43f7e925e261a576a02513b Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 9 Apr 2024 22:23:40 +0200 Subject: [PATCH 095/147] Only quotes --- test/sql/attach/attach_dbname_quotes.test | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/sql/attach/attach_dbname_quotes.test b/test/sql/attach/attach_dbname_quotes.test index 6c1069dd95c6..2c4157dcb1ac 100644 --- a/test/sql/attach/attach_dbname_quotes.test +++ b/test/sql/attach/attach_dbname_quotes.test @@ -40,3 +40,9 @@ query I SELECT * FROM tbl ---- 84 + +statement ok +CREATE SCHEMA """" + +statement ok +USE """" From eb760b8aae9eed69ef5087168d4085d86e991468 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 9 Apr 2024 23:54:16 +0200 Subject: [PATCH 096/147] Test fix --- test/sql/attach/attach_table_info.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/sql/attach/attach_table_info.test b/test/sql/attach/attach_table_info.test index d8609aab98be..7f312f0d530c 100644 --- a/test/sql/attach/attach_table_info.test +++ b/test/sql/attach/attach_table_info.test @@ -38,7 +38,7 @@ SELECT current_database() memory statement ok -USE "new_database.new_schema" +USE new_database.new_schema query ITTTTT nosort table_info PRAGMA table_info('integers'); From 684c10bb92d0fc11c351ee19794edcb69c1babbf Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Tue, 9 Apr 2024 23:54:52 +0200 Subject: [PATCH 097/147] Fix for Python client bind --- tools/pythonpkg/src/python_udf.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/pythonpkg/src/python_udf.cpp b/tools/pythonpkg/src/python_udf.cpp index 
8850e8e0218d..45f1f40c6f16 100644 --- a/tools/pythonpkg/src/python_udf.cpp +++ b/tools/pythonpkg/src/python_udf.cpp @@ -65,7 +65,7 @@ static void ConvertPyArrowToDataChunk(const py::object &table, Vector &out, Clie vector input_types; vector input_names; - auto bind_input = TableFunctionBindInput(children, named_params, input_types, input_names, nullptr); + TableFunctionBindInput bind_input(children, named_params, input_types, input_names, nullptr, nullptr); vector return_types; vector return_names; From 797aa1e571d7a89ee094b958b4ad4f130913f914 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Wed, 10 Apr 2024 08:46:21 +0200 Subject: [PATCH 098/147] skip_reload because of ATTACH --- test/sql/tpcds/dsdgen_readonly.test | 2 ++ test/sql/tpch/dbgen_readonly.test | 2 ++ 2 files changed, 4 insertions(+) diff --git a/test/sql/tpcds/dsdgen_readonly.test b/test/sql/tpcds/dsdgen_readonly.test index a037cade63ec..ec0c448d8e2e 100644 --- a/test/sql/tpcds/dsdgen_readonly.test +++ b/test/sql/tpcds/dsdgen_readonly.test @@ -4,6 +4,8 @@ require tpcds +require skip_reload + load __TEST_DIR__/test_dsdgen_ro.db statement ok diff --git a/test/sql/tpch/dbgen_readonly.test b/test/sql/tpch/dbgen_readonly.test index de5ecc8d3377..33cdb551c609 100644 --- a/test/sql/tpch/dbgen_readonly.test +++ b/test/sql/tpch/dbgen_readonly.test @@ -4,6 +4,8 @@ require tpch +require skip_reload + load __TEST_DIR__/test_dbgen_ro.db statement ok From 9562296f692c5acba20b60b296e80585da803cc3 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 10 Apr 2024 09:35:22 +0200 Subject: [PATCH 099/147] [CI] Remove GITHUB_PAT variable from R-CMD-check --- .github/workflows/R_CMD_CHECK.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/R_CMD_CHECK.yml b/.github/workflows/R_CMD_CHECK.yml index a5fe7c191be1..d111e135547a 100644 --- a/.github/workflows/R_CMD_CHECK.yml +++ b/.github/workflows/R_CMD_CHECK.yml @@ -51,7 +51,6 @@ jobs: - {os: ubuntu-latest, r: 'release'} env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes steps: From 1a1cd7b565b5b7f6f87f85f249cd2fa46f6f0965 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Wed, 10 Apr 2024 09:43:00 +0200 Subject: [PATCH 100/147] Remove quotes in tests --- test/sql/attach/attach_did_you_mean.test | 2 +- test/sql/attach/attach_nested_types.test | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/sql/attach/attach_did_you_mean.test b/test/sql/attach/attach_did_you_mean.test index 26c418a4ba6f..967d2103e3a5 100644 --- a/test/sql/attach/attach_did_you_mean.test +++ b/test/sql/attach/attach_did_you_mean.test @@ -55,7 +55,7 @@ SELECT * FROM memory.hello # what if we switch default database AND default schema? 
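The test change below ties back to patches 094 and 095 earlier in this series: `USE db1.myschema` is now parsed as a dotted search-path entry, so quoting ordinary names is unnecessary, while names that genuinely contain quotes are written with doubled quotes (`"my""db"`). A rough standalone sketch of the doubled-quote rule, simplified from what `CatalogSearchEntry::ParseInternal` does (the real parser also handles `.` separators and unquoted entries):

```cpp
#include <cstddef>
#include <stdexcept>
#include <string>

// Parse a double-quoted identifier starting at input[idx] == '"'.
// A doubled quote ("") inside the identifier denotes a literal quote.
std::string ParseQuotedIdentifier(const std::string &input, size_t &idx) {
	if (idx >= input.size() || input[idx] != '"') {
		throw std::invalid_argument("expected opening quote");
	}
	idx++; // skip the opening quote
	std::string entry;
	while (idx < input.size()) {
		if (input[idx] == '"') {
			idx++;
			if (idx < input.size() && input[idx] == '"') {
				entry += '"'; // escaped quote
				idx++;
				continue;
			}
			return entry; // closing quote
		}
		entry += input[idx++];
	}
	throw std::invalid_argument("unterminated quoted identifier");
}
```

For example, parsing `"my""db"` with this routine yields `my"db`, which is the behaviour the new attach_dbname_quotes.test checks for `USE "my""db"`. The statements below simply drop quotes that were never needed.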
statement ok -USE "db1.myschema" +USE db1.myschema statement ok SELECT * FROM blablabla diff --git a/test/sql/attach/attach_nested_types.test b/test/sql/attach/attach_nested_types.test index 9b63c6e8c052..99369a286c99 100644 --- a/test/sql/attach/attach_nested_types.test +++ b/test/sql/attach/attach_nested_types.test @@ -49,7 +49,7 @@ SELECT "table" FROM database.schema.table {'col': {'field': 42}} statement ok -USE "database.schema" +USE database.schema query I SELECT "table" FROM "table" From 6f3e37475d983fece4b967ed7a438b3379b3bc4d Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 10 Apr 2024 10:17:32 +0200 Subject: [PATCH 101/147] Bump-back duckdb_azure to pre-lzma custom vcpkg-port moving bminor mirror --- .github/config/out_of_tree_extensions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/out_of_tree_extensions.cmake b/.github/config/out_of_tree_extensions.cmake index 131d84e90812..c94a1e029af6 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -38,7 +38,7 @@ if (NOT MINGW) duckdb_extension_load(azure LOAD_TESTS GIT_URL https://github.com/duckdb/duckdb_azure - GIT_TAG 6620a32454c1eb2e455104d87262061d2464aad0 + GIT_TAG 506b1fa0f3f892000130feac7a0e1de346095e80 APPLY_PATCHES ) endif() From f41e41be8f49d6dc9f7bdd23dcc992aaefa098bc Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 10 Apr 2024 11:17:53 +0200 Subject: [PATCH 102/147] Bump-forward duckdb_azure to main, post-lzma custom vcpkg-port moving mirror --- .github/config/out_of_tree_extensions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/out_of_tree_extensions.cmake b/.github/config/out_of_tree_extensions.cmake index c94a1e029af6..5d1795ef1168 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -38,7 +38,7 @@ if (NOT MINGW) duckdb_extension_load(azure LOAD_TESTS GIT_URL https://github.com/duckdb/duckdb_azure - GIT_TAG 506b1fa0f3f892000130feac7a0e1de346095e80 + GIT_TAG 4512a652479016d40d712f990cab9b9aab43d341 APPLY_PATCHES ) endif() From bbd74ac95dc58000d7add23c2ecf099a5ff284a5 Mon Sep 17 00:00:00 2001 From: Christina Sioula Date: Fri, 5 Apr 2024 12:59:37 +0200 Subject: [PATCH 103/147] support reading gzipped files with the test runner --- .gitignore | 3 +- data/storage/index_0-9-1.db.gz | Bin 0 -> 3332 bytes data/storage/test.db.gz | Bin 0 -> 591 bytes test/sql/storage/unzip.test | 62 +++++++++++++++++++++++++++ test/sqlite/sqllogic_command.cpp | 36 ++++++++++++++++ test/sqlite/sqllogic_command.hpp | 16 +++++++ test/sqlite/sqllogic_parser.cpp | 3 ++ test/sqlite/sqllogic_parser.hpp | 3 +- test/sqlite/sqllogic_test_runner.cpp | 34 +++++++++++++-- 9 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 data/storage/index_0-9-1.db.gz create mode 100644 data/storage/test.db.gz create mode 100644 test/sql/storage/unzip.test diff --git a/.gitignore b/.gitignore index 67114a06f137..f754b6ec73c4 100644 --- a/.gitignore +++ b/.gitignore @@ -339,9 +339,10 @@ zig-cache/* # .db files *.db +*.db.gz # local cmake extension config extension/extension_config_local.cmake # extension_external dir -extension_external \ No newline at end of file +extension_external diff --git a/data/storage/index_0-9-1.db.gz b/data/storage/index_0-9-1.db.gz new file mode 100644 index 0000000000000000000000000000000000000000..c4f963ac8cc47a3dc49c7d763924dfe3a9547da8 GIT binary patch literal 3332 zcmd^<{WlwC8^_bqO=}aIJ~}PUTJ^F;Lx*s@WkTyQDlq~m 
zjN79D2C^ytM9Ww!z;beoz7LX;n|r++o?q@~6DsiXJ$xj=&zeRxw8!kg#$8;Rn4ah= zE;#(XWm`Y(bKk?3XR})R{e!*(`FcTafBmyNaKcP6;$Ki+TyPdYxXaO5YJpGh_J?-W zcGU_4>t5)31Zwt1sdtTjxU^Cbm@T7htn)orf!;Zyrpli^V>uxA_1+V6O3DGsJs)V! z)ksZTmDNrF0O0wK1kwcuKr?YJZ~nni{cnZ0HIZEm#a}8tXNfY7dpYZLz7^uF z&TlYB__~HvAbIHLecA(M0#nJWGZpm>Zf@KvGCY1X#nxh(#(a5iH%Z{qdDb}DvHpOq zaTQw_9b+&&GcQfr#`ftZ6jm&U$_Z0;M%q>JiA#Im(AkjQNX!rC)c%-@$~J_U48eM==DmP?Aj(hnxQIl z$hG=LAT_+a=F!I0)6ea}7#DL%<=3xc5c6p}0Sh~a-+Sy>EMSM*wSg8^22(d0+QzSG zk4Ip`Z1J^#Ui_SOFI2E-eErEWp42e+$(IC^uUiRZ<#%xRkkFh6w0-v3-%R=ba-}3V z=2UD^M-ugw*S;!|&ap+Pe8?bX_^tU>>F4yb(&0NTrI+50Y^2T_Iv+o71+}NOhiG6a z5R%BvUp~GsR2)T99k_6vrnFLC*Ne$(F#dh9=Hxun(u233SS6ytb)9gX)FCOe z|4O&m`o_zwGL&ErwDGH!ba}Q7xESHo|@HIy#SZ8#1`E zQrIkm46d%@&)tQsg-DCtPRn1@aMxI_KIe*u??i6Ke8ZlvH=t#P)+zSg)$SQ2^`%@q zKU7kgHzH{S@+~$$g6(~By(K4Ac-+julJ~LyIS`MGM)Go;s(mT~lB4A_Kd{`YZ=FPA zJxVGYE9gCT)z2y#VbwP}a4iULq-R*o2VWJp?%Li8Z9$j*5KRH{slg!asOP5u24v6R zp-=2aXBIGwieY0+BRkRO-UDG+B9Zs{{{4aX?&wQ0C8Y1*H)Klg@75*i=X+gye1GiB zw5c)JV$XIS)5Co-ft0%P%SCd^}SSl0lPWh;_674i3@zfmJy~T6e zdAyiHd4hXG8U@y&m;%6MDfJ42w*n2j$&=(8E9XVz^+ccZO?6;4+i_?DiGKoaD=L-P zO@?3~uxhFV$^OE(`?G&@BvClF<_&?cE(2PIVE}DuLFY{lLAtq!l}BqA-cHA*i~6xN dReSViC_VLRqwQ>@E!d1^2z?0cX8Vr<(@# zoSi49g$K+#YW#ucp5x8Po#+1SY)drhQme7^J-hz%?*F@XzqkAzW1Ay>`S0Q7Q_nu# zy3}CK=Agoj5mI;bL&ct*dV1^5pFaP3<`Tbty=Z?9266_%U#Jqp?6TeEH z4Q2%L?!T-nd;G5A*{+Gt|9W|xR$SYnBDOWa;?7O$Td&V>O^Hrk8*eHy>+0pS_0_`1 zX6~G?1Y_UjO^?n0Z%{{Ot2x zzi$y-eL;@%tHg)fZ$Iknu3cGm@ZV(N>idF64o^B|aD7eLbBRZRa~~B|m1fu0UU|>? z!~dAuUj~K)*_A6!n5#eEs8qT6l*Prk<}C*}-t+w0x4(Y>|HZW+TSgu@Aa2W?k>_}a I!GMJU0N>E9%K!iX literal 0 HcmV?d00001 diff --git a/test/sql/storage/unzip.test b/test/sql/storage/unzip.test new file mode 100644 index 000000000000..6592fc6c672c --- /dev/null +++ b/test/sql/storage/unzip.test @@ -0,0 +1,62 @@ +# name: test/sql/storage/unzip.test +# description: Support gzipped files in the test runner +# group: [storage] + +# unzip to specific path +unzip data/storage/test.db.gz __TEST_DIR__/test.db + +load __TEST_DIR__/test.db readonly + +query I +SELECT a+1 FROM tbl; +---- +6 + +# unzip a 1.8M file to default extraction path -> __TEST_DIR__/ +unzip data/storage/index_0-9-1.db.gz + +load __TEST_DIR__/index_0-9-1.db readonly + +query II +SELECT table_name, index_count FROM duckdb_tables() ORDER BY table_name; +---- +fk_tbl 1 +idx_tbl 2 +pk_tbl 2 + +# unzip to default extraction path from NULL input +unzip data/storage/test.db.gz NULL + +load __TEST_DIR__/test.db readonly + +query I +SELECT a+2 FROM tbl; +---- +7 + +## test invalid use +# unzip + +## not gzipped database +# unzip data/storage/test.db + +## not gzipped database +# unzip data/storage/test.db + +## test NULL input paths +# unzip NULL +# unzip NULL NULL +# unzip NULL data/storage/test.db + +## invalid input path +# unzip path/to/nowhere data/storage/not_existed.db + +## invalid extraction path +# unzip data/storage/test.db.gz path/to/nowhere + +## already existed database file in the extraction - warning: this will overwrite existed wal_test_092.db +# unzip data/storage/test.db.gz data/storage/wal_test_092.db + +## extraction path to directory +# unzip data/storage/test.db.gz __TEST_DIR__/ +# unzip data/storage/test.db.gz __TEST_DIR__ diff --git a/test/sqlite/sqllogic_command.cpp b/test/sqlite/sqllogic_command.cpp index b9f64b5f407a..bd1b3e459729 100644 --- a/test/sqlite/sqllogic_command.cpp +++ b/test/sqlite/sqllogic_command.cpp @@ -144,6 +144,10 @@ SleepCommand::SleepCommand(SQLLogicTestRunner &runner, idx_t duration, SleepUnit : Command(runner), duration(duration), unit(unit) 
{
}

+UnzipCommand::UnzipCommand(SQLLogicTestRunner &runner, string &input, string &output)
+    : Command(runner), input_path(input), extraction_path(output) {
+}
+
 struct ParallelExecuteContext {
 	ParallelExecuteContext(SQLLogicTestRunner &runner, const vector<duckdb::unique_ptr<Command>> &loop_commands,
 	                       LoopDefinition definition)
@@ -387,4 +391,36 @@ void Statement::ExecuteInternal(ExecuteContext &context) const {
 	}
 }
 
+void UnzipCommand::ExecuteInternal(ExecuteContext &context) const {
+	VirtualFileSystem vfs;
+
+	// input
+	FileOpenFlags in_flags(FileFlags::FILE_FLAGS_READ);
+	in_flags.SetCompression(FileCompressionType::GZIP);
+	auto compressed_file_handle = vfs.OpenFile(input_path, in_flags);
+	if (compressed_file_handle == nullptr) {
+		throw CatalogException("Cannot open the file \"%s\"", input_path);
+	}
+
+	// read the compressed data from the file
+	int64_t file_size = vfs.GetFileSize(*compressed_file_handle);
+	std::unique_ptr<char[]> compressed_buffer(new char[BUFFER_SIZE]);
+	int64_t bytes_read = vfs.Read(*compressed_file_handle, compressed_buffer.get(), BUFFER_SIZE);
+	if (bytes_read < file_size) {
+		throw CatalogException("Cannot read the file \"%s\"", input_path);
+	}
+
+	// output
+	FileOpenFlags out_flags(FileOpenFlags::FILE_FLAGS_FILE_CREATE | FileOpenFlags::FILE_FLAGS_WRITE);
+	auto output_file = vfs.OpenFile(extraction_path, out_flags);
+	if (!output_file) {
+		throw CatalogException("Cannot open the file \"%s\"", extraction_path);
+	}
+
+	// write only the bytes that were actually decompressed
+	int64_t bytes_written = vfs.Write(*output_file, compressed_buffer.get(), bytes_read);
+	if (bytes_written < bytes_read) {
+		throw CatalogException("Cannot write the file \"%s\"", extraction_path);
+	}
+}
+
 } // namespace duckdb
diff --git a/test/sqlite/sqllogic_command.hpp b/test/sqlite/sqllogic_command.hpp
index 0f45d8aa2755..d6b2e6c9f71a 100644
--- a/test/sqlite/sqllogic_command.hpp
+++ b/test/sqlite/sqllogic_command.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include "duckdb.hpp"
+#include "duckdb/common/virtual_file_system.hpp"
 
 namespace duckdb {
 class SQLLogicTestRunner;
@@ -142,4 +143,19 @@ class SleepCommand : public Command {
 	SleepUnit unit;
 };
 
+class UnzipCommand : public Command {
+public:
+	// 8 MB
+	static constexpr const int64_t BUFFER_SIZE = 1u << 23;
+
+public:
+	UnzipCommand(SQLLogicTestRunner &runner, string &input, string &output);
+
+	void ExecuteInternal(ExecuteContext &context) const override;
+
+private:
+	string input_path;
+	string extraction_path;
+};
+
 } // namespace duckdb
diff --git a/test/sqlite/sqllogic_parser.cpp b/test/sqlite/sqllogic_parser.cpp
index 8cccc18ff64e..8fdb3f1bb85c 100644
--- a/test/sqlite/sqllogic_parser.cpp
+++ b/test/sqlite/sqllogic_parser.cpp
@@ -166,6 +166,7 @@ bool SQLLogicParser::IsSingleLineStatement(SQLLogicToken &token) {
 	case SQLLogicTokenType::SQLLOGIC_RESTART:
 	case SQLLogicTokenType::SQLLOGIC_RECONNECT:
 	case SQLLogicTokenType::SQLLOGIC_SLEEP:
+	case SQLLogicTokenType::SQLLOGIC_UNZIP:
 		return true;
 
 	case SQLLogicTokenType::SQLLOGIC_SKIP_IF:
@@ -219,6 +220,8 @@ SQLLogicTokenType SQLLogicParser::CommandToToken(const string &token) {
 		return SQLLogicTokenType::SQLLOGIC_RECONNECT;
 	} else if (token == "sleep") {
 		return SQLLogicTokenType::SQLLOGIC_SLEEP;
+	} else if (token == "unzip") {
+		return SQLLogicTokenType::SQLLOGIC_UNZIP;
 	}
 	Fail("Unrecognized parameter %s", token);
 	return SQLLogicTokenType::SQLLOGIC_INVALID;
diff --git a/test/sqlite/sqllogic_parser.hpp b/test/sqlite/sqllogic_parser.hpp
index af32e943c509..87e79be2b3a1 100644
--- a/test/sqlite/sqllogic_parser.hpp
+++ b/test/sqlite/sqllogic_parser.hpp
@@ -34,7 +34,8 @@ enum class
SQLLogicTokenType {
 	SQLLOGIC_LOAD,
 	SQLLOGIC_RESTART,
 	SQLLOGIC_RECONNECT,
-	SQLLOGIC_SLEEP
+	SQLLOGIC_SLEEP,
+	SQLLOGIC_UNZIP
 };
 
 class SQLLogicToken {
diff --git a/test/sqlite/sqllogic_test_runner.cpp b/test/sqlite/sqllogic_test_runner.cpp
index 7fbbc621445f..6c7f32d38859 100644
--- a/test/sqlite/sqllogic_test_runner.cpp
+++ b/test/sqlite/sqllogic_test_runner.cpp
@@ -1,12 +1,14 @@
 #include "catch.hpp"
-
-#include "sqllogic_test_runner.hpp"
 #include "test_helpers.hpp"
+#include "sqllogic_parser.hpp"
+#include "sqllogic_test_runner.hpp"
 
 #include "duckdb/main/extension_helper.hpp"
 #include "duckdb/main/extension/generated_extension_loader.hpp"
 #include "duckdb/main/extension_entries.hpp"
-#include "sqllogic_parser.hpp"
+#include "duckdb/common/virtual_file_system.hpp"
+#include "duckdb/common/file_open_flags.hpp"
+
 #ifdef DUCKDB_OUT_OF_TREE
 #include DUCKDB_EXTENSION_HEADER
 #endif
@@ -732,6 +734,32 @@ void SQLLogicTestRunner::ExecuteFile(string script) {
 			auto sleep_unit = SleepCommand::ParseUnit(token.parameters[1]);
 			auto command = make_uniq<SleepCommand>(*this, sleep_duration, sleep_unit);
 			ExecuteCommand(std::move(command));
+		} else if (token.type == SQLLogicTokenType::SQLLOGIC_UNZIP) {
+			if (token.parameters.size() != 1 && token.parameters.size() != 2) {
+				parser.Fail("unzip requires 1 argument: <path/to/file.db.gz> [optional: "
+				            "<path/to/extraction/folder>, default: __TEST_DIR__/]");
+			}
+
+			// set input path
+			auto input_path = ReplaceKeywords(token.parameters[0]);
+
+			// file name
+			idx_t filename_start_pos = input_path.find_last_of("/") + 1;
+			if (!StringUtil::EndsWith(input_path, ".gz")) {
+				parser.Fail("unzip: input does not have a GZIP extension");
+			}
+			string filename = input_path.substr(filename_start_pos, input_path.size() - filename_start_pos - 3);
+
+			// extraction path
+			string default_extraction_path = ReplaceKeywords("__TEST_DIR__/" + filename);
+			string extraction_path =
+			    (token.parameters.size() == 2) ? ReplaceKeywords(token.parameters[1]) : default_extraction_path;
+			if (extraction_path == "NULL") {
+				extraction_path = default_extraction_path;
+			}
+
+			auto command = make_uniq<UnzipCommand>(*this, input_path, extraction_path);
+			ExecuteCommand(std::move(command));
 		}
 	}
 	if (InLoop()) {

From 8097fdc0baa9fb0b36a5df08df93734dc507dbca Mon Sep 17 00:00:00 2001
From: Christina Sioula
Date: Wed, 10 Apr 2024 13:13:44 +0200
Subject: [PATCH 104/147] add README.md

---
 data/storage/README.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 data/storage/README.md

diff --git a/data/storage/README.md b/data/storage/README.md
new file mode 100644
index 000000000000..82b1534a5a62
--- /dev/null
+++ b/data/storage/README.md
@@ -0,0 +1,23 @@
+# DuckDB database files
+Starting with DuckDB v0.10, database storage files are backward-compatible: newer DuckDB versions can seamlessly read files generated by older versions.
+*Note: it is recommended to compress large database files.*
+
+For further details, please refer to: [Storage & Compatibility](https://duckdb.org/docs/internals/storage#compatibility)
+
+### How to gzip a database file
+```
+gzip <path/to/database.db>
+```
+
+### How to decompress and load a gzipped database file in the test runner
+```
+unzip <path/to/database.db.gz> <path/to/extraction/folder/database.db>
+
+load <path/to/extraction/folder/database.db> readonly
+```
+
+or by using the default extraction path `__TEST_DIR__/` (temporary space)
+```
+unzip <path/to/database.db.gz>
+
+load __TEST_DIR__/<database.db> readonly
+```

From 4fad019905a4482fde4aa9fb901dfa2266fce3df Mon Sep 17 00:00:00 2001
From: Mark Raasveldt
Date: Wed, 10 Apr 2024 13:57:24 +0200
Subject: [PATCH 105/147] Add a useful comment

---
 test/sql/attach/attach_dbname_quotes.test | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/sql/attach/attach_dbname_quotes.test b/test/sql/attach/attach_dbname_quotes.test
index 2c4157dca918..c231f066682a 100644
--- a/test/sql/attach/attach_dbname_quotes.test
+++ b/test/sql/attach/attach_dbname_quotes.test
@@ -13,6 +13,7 @@ CREATE TABLE "my""db".tbl(i int);
 statement ok
 INSERT INTO "my""db".tbl VALUES (42)
 
+# use with a database name in quotes
 statement ok
 USE "my""db";

From 75aeb028914a9b58f3eb8c43691b035cfbceeaf2 Mon Sep 17 00:00:00 2001
From: Pedro Holanda
Date: Wed, 10 Apr 2024 13:58:31 +0200
Subject: [PATCH 106/147] table creation is too much for the CI, time to count

---
 .../copy/csv/test_multiple_big_compressed_csvs.test_slow | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow
index 4c8116b7d3cd..4b0fa97dc92c 100644
--- a/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow
+++ b/test/sql/copy/csv/test_multiple_big_compressed_csvs.test_slow
@@ -19,8 +19,8 @@ DROP TABLE lineitem;
 statement ok
 CREATE TABLE lineitem(l_orderkey INTEGER NOT NULL, l_partkey INTEGER NOT NULL, l_suppkey INTEGER NOT NULL, l_linenumber INTEGER NOT NULL, l_quantity DECIMAL(15,2) NOT NULL, l_extendedprice DECIMAL(15,2) NOT NULL, l_discount DECIMAL(15,2) NOT NULL, l_tax DECIMAL(15,2) NOT NULL, l_returnflag VARCHAR NOT NULL, l_linestatus VARCHAR NOT NULL, l_shipdate DATE NOT NULL, l_commitdate DATE NOT NULL, l_receiptdate DATE NOT NULL, l_shipinstruct VARCHAR NOT NULL, l_shipmode VARCHAR NOT NULL, l_comment VARCHAR NOT NULL);
 
-statement ok
-INSERT INTO lineitem FROM read_csv([
+query I
+select count(*) from read_csv([
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 '__TEST_DIR__/lineitem.csv.gz',
 ]);
-
-query I
-select count(*) from lineitem
 ----
 144029160
\ No newline at end of file

From 4bfe3f3fe3020e754d278ea39911eded230b66e0 Mon Sep 17 00:00:00 2001
From: Tom Ebergen
Date: Wed, 10 Apr 2024 14:06:22 +0200
Subject: [PATCH 107/147] add test

---
 src/optimizer/optimizer.cpp                   | 12 ++--
 test/optimizer/arithmetic_simplification.test | 66 +++++++------------
 test/optimizer/topn/complex_top_n.test        | 43 ++++++++++++
 test/optimizer/{ => topn}/topn_optimizer.test |  2 +-
 4 files changed, 75 insertions(+), 48 deletions(-)
 create mode 100644 test/optimizer/topn/complex_top_n.test
 rename test/optimizer/{ => topn}/topn_optimizer.test (94%)

diff --git a/src/optimizer/optimizer.cpp b/src/optimizer/optimizer.cpp
index ce20ca7d8a5f..b4bbd37ce7d9 100644
--- a/src/optimizer/optimizer.cpp
+++ b/src/optimizer/optimizer.cpp
@@ -153,6 +153,12 @@ unique_ptr<LogicalOperator> Optimizer::Optimize(unique_ptr<LogicalOperator> plan
cse_optimizer.VisitOperator(*plan); }); + // transform ORDER BY + LIMIT to TopN + RunOptimizer(OptimizerType::TOP_N, [&]() { + TopN topn; + plan = topn.Optimize(std::move(plan)); + }); + // creates projection maps so unused columns are projected out early RunOptimizer(OptimizerType::COLUMN_LIFETIME, [&]() { ColumnLifetimeAnalyzer column_lifetime(true); @@ -179,12 +185,6 @@ unique_ptr Optimizer::Optimize(unique_ptr plan column_lifetime.VisitOperator(*plan); }); - // transform ORDER BY + LIMIT to TopN - RunOptimizer(OptimizerType::TOP_N, [&]() { - TopN topn; - plan = topn.Optimize(std::move(plan)); - }); - // apply simple expression heuristics to get an initial reordering RunOptimizer(OptimizerType::REORDER_FILTER, [&]() { ExpressionHeuristics expression_heuristics(*this); diff --git a/test/optimizer/arithmetic_simplification.test b/test/optimizer/arithmetic_simplification.test index f55e0031a6d1..f3bdd71f9d9c 100644 --- a/test/optimizer/arithmetic_simplification.test +++ b/test/optimizer/arithmetic_simplification.test @@ -1,48 +1,32 @@ # name: test/optimizer/arithmetic_simplification.test -# description: Arithmetic simplification test +# description: topN # group: [optimizer] statement ok -CREATE TABLE test(X INTEGER); +attach 'appian.duckdb' as appian; statement ok -PRAGMA explain_output = OPTIMIZED_ONLY; - -# verify that nop arithmetic is flattened -query I nosort xnorm -EXPLAIN SELECT X FROM test ----- - -query I nosort xnorm -EXPLAIN SELECT X+0 FROM test ----- - -query I nosort xnorm -EXPLAIN SELECT 0+X FROM test ----- - -query I nosort xnorm -EXPLAIN SELECT X-0 FROM test ----- - -query I nosort xnorm -EXPLAIN SELECT X*1 FROM test ----- - -query I nosort xnorm -EXPLAIN SELECT 1*X FROM test ----- - -query I nosort xnorm -EXPLAIN SELECT X//1 FROM test ----- - -# division by zero results in a NULL -query I nosort xnull -EXPLAIN SELECT NULL FROM test ----- - -query I nosort xnull -EXPLAIN SELECT X//0 FROM test ----- +use appian; +statement ok +WITH CTE AS ( + SELECT J1P, CUSTOMER_PRIORITY, CUSTOMER_ID FROM CUSTOMERVIEW + LEFT JOIN ( + SELECT ORDER_CUSTOMERID, SUM(ORDERITEMVIEW.ORDERITEM_QUANTITY) AS J1P FROM ORDERVIEW + LEFT JOIN ORDERITEMVIEW ON (ORDERVIEW.ORDER_ID = ORDERITEMVIEW.ORDERITEM_ORDERID) + WHERE (ORDERVIEW.ORDER_ISEXPEDITEDSHIPPED IS TRUE) + GROUP BY ORDERVIEW.ORDER_CUSTOMERID + ) AS J1J ON (J1J.ORDER_CUSTOMERID = CUSTOMERVIEW.CUSTOMER_ID) + ORDER BY CUSTOMER_PRIORITY ASC + LIMIT 50 OFFSET 50 +) SELECT J1P, Q2P, Q3P FROM CTE +LEFT JOIN ( + SELECT ORDER_CUSTOMERID FROM ORDERVIEW +) AS Q1J ON (Q1J.ORDER_CUSTOMERID = CTE.CUSTOMER_ID) +LEFT JOIN ( + SELECT CREDITCARD_CUSTOMERID AS Q2P FROM CREDITCARDVIEW +) AS Q2J ON (Q2J.Q2P = CTE.CUSTOMER_ID) +LEFT JOIN ( + SELECT ORDER_CUSTOMERID Q3P FROM ORDERVIEW + LEFT JOIN ORDERITEMVIEW ON ORDERVIEW.ORDER_ID = ORDERITEM_ORDERID +) AS Q3J ON (Q3J.Q3P = CTE.CUSTOMER_ID); diff --git a/test/optimizer/topn/complex_top_n.test b/test/optimizer/topn/complex_top_n.test new file mode 100644 index 000000000000..db3d7556ca69 --- /dev/null +++ b/test/optimizer/topn/complex_top_n.test @@ -0,0 +1,43 @@ +# name: test/optimizer/topn/complex_top_n.test +# description: topN +# group: [optimizer] + +statement ok +SELECT SETSEED(0.42); + +statement ok +create table CUSTOMERVIEW as select range customer_id, range*random()::INT % 3 as customer_priority from range(1000, 2000); + +statement ok +create table OrderView as select range order_id, ((random()*2::INT)%2)::BOOL order_isExpeditedShipped, range + (random() * 3)::INT order_customerId from range(1000, 2000); + 
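+# (aside, a minimal sketch of the shape this test guards, using hypothetical
+#  tables t and t2: an ORDER BY with LIMIT/OFFSET inside a CTE, e.g.
+#    WITH cte AS (SELECT i FROM t ORDER BY i LIMIT 50 OFFSET 50)
+#    SELECT * FROM cte LEFT JOIN t2 USING (i);
+#  must still collapse into a single TOP_N operator now that the Top-N
+#  transform runs before column-lifetime analysis)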
+statement ok +create table OrderItemView as select random()*25 orderItem_quantity, range orderItem_orderId from range(1000, 2000); + +statement ok +create table CREDITCARDVIEW as select range CREDITCARD_CUSTOMERID from range(1000, 2000); + +query III +WITH CTE AS ( + SELECT J1P, CUSTOMER_PRIORITY, CUSTOMER_ID FROM CUSTOMERVIEW + LEFT JOIN ( + SELECT ORDER_CUSTOMERID, SUM(ORDERITEMVIEW.ORDERITEM_QUANTITY) AS J1P FROM ORDERVIEW + LEFT JOIN ORDERITEMVIEW ON (ORDERVIEW.ORDER_ID = ORDERITEMVIEW.ORDERITEM_ORDERID) + WHERE (ORDERVIEW.ORDER_ISEXPEDITEDSHIPPED IS TRUE) + GROUP BY ORDERVIEW.ORDER_CUSTOMERID + ) AS J1J ON (J1J.ORDER_CUSTOMERID = CUSTOMERVIEW.CUSTOMER_ID) + ORDER BY CUSTOMER_PRIORITY ASC + LIMIT 50 OFFSET 50 +) SELECT J1P, Q2P, Q3P FROM CTE +LEFT JOIN ( + SELECT ORDER_CUSTOMERID FROM ORDERVIEW +) AS Q1J ON (Q1J.ORDER_CUSTOMERID = CTE.CUSTOMER_ID) +LEFT JOIN ( + SELECT CREDITCARD_CUSTOMERID AS Q2P FROM CREDITCARDVIEW +) AS Q2J ON (Q2J.Q2P = CTE.CUSTOMER_ID) +LEFT JOIN ( + SELECT ORDER_CUSTOMERID Q3P FROM ORDERVIEW + LEFT JOIN ORDERITEMVIEW ON ORDERVIEW.ORDER_ID = ORDERITEM_ORDERID +) AS Q3J ON (Q3J.Q3P = CTE.CUSTOMER_ID); +---- +432 values hashing to c51b3f7dd78a68c95de3f44866394cfb diff --git a/test/optimizer/topn_optimizer.test b/test/optimizer/topn/topn_optimizer.test similarity index 94% rename from test/optimizer/topn_optimizer.test rename to test/optimizer/topn/topn_optimizer.test index ceb042b6ef49..e899e1c7e974 100644 --- a/test/optimizer/topn_optimizer.test +++ b/test/optimizer/topn/topn_optimizer.test @@ -1,4 +1,4 @@ -# name: test/optimizer/topn_optimizer.test +# name: test/optimizer/topn/opn_optimizer.test # description: Test Top N optimization # group: [optimizer] From df62014929d7de833d6b3c2e815bb679d0fed04f Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Wed, 10 Apr 2024 14:20:59 +0200 Subject: [PATCH 108/147] make format-fix, and fix old test case --- test/optimizer/arithmetic_simplification.test | 66 ++++++++++++------- test/optimizer/topn/complex_top_n.test | 4 +- test/optimizer/topn/topn_optimizer.test | 4 +- 3 files changed, 45 insertions(+), 29 deletions(-) diff --git a/test/optimizer/arithmetic_simplification.test b/test/optimizer/arithmetic_simplification.test index f3bdd71f9d9c..f55e0031a6d1 100644 --- a/test/optimizer/arithmetic_simplification.test +++ b/test/optimizer/arithmetic_simplification.test @@ -1,32 +1,48 @@ # name: test/optimizer/arithmetic_simplification.test -# description: topN +# description: Arithmetic simplification test # group: [optimizer] statement ok -attach 'appian.duckdb' as appian; +CREATE TABLE test(X INTEGER); statement ok -use appian; +PRAGMA explain_output = OPTIMIZED_ONLY; + +# verify that nop arithmetic is flattened +query I nosort xnorm +EXPLAIN SELECT X FROM test +---- + +query I nosort xnorm +EXPLAIN SELECT X+0 FROM test +---- + +query I nosort xnorm +EXPLAIN SELECT 0+X FROM test +---- + +query I nosort xnorm +EXPLAIN SELECT X-0 FROM test +---- + +query I nosort xnorm +EXPLAIN SELECT X*1 FROM test +---- + +query I nosort xnorm +EXPLAIN SELECT 1*X FROM test +---- + +query I nosort xnorm +EXPLAIN SELECT X//1 FROM test +---- + +# division by zero results in a NULL +query I nosort xnull +EXPLAIN SELECT NULL FROM test +---- + +query I nosort xnull +EXPLAIN SELECT X//0 FROM test +---- -statement ok -WITH CTE AS ( - SELECT J1P, CUSTOMER_PRIORITY, CUSTOMER_ID FROM CUSTOMERVIEW - LEFT JOIN ( - SELECT ORDER_CUSTOMERID, SUM(ORDERITEMVIEW.ORDERITEM_QUANTITY) AS J1P FROM ORDERVIEW - LEFT JOIN ORDERITEMVIEW ON (ORDERVIEW.ORDER_ID = 
ORDERITEMVIEW.ORDERITEM_ORDERID) - WHERE (ORDERVIEW.ORDER_ISEXPEDITEDSHIPPED IS TRUE) - GROUP BY ORDERVIEW.ORDER_CUSTOMERID - ) AS J1J ON (J1J.ORDER_CUSTOMERID = CUSTOMERVIEW.CUSTOMER_ID) - ORDER BY CUSTOMER_PRIORITY ASC - LIMIT 50 OFFSET 50 -) SELECT J1P, Q2P, Q3P FROM CTE -LEFT JOIN ( - SELECT ORDER_CUSTOMERID FROM ORDERVIEW -) AS Q1J ON (Q1J.ORDER_CUSTOMERID = CTE.CUSTOMER_ID) -LEFT JOIN ( - SELECT CREDITCARD_CUSTOMERID AS Q2P FROM CREDITCARDVIEW -) AS Q2J ON (Q2J.Q2P = CTE.CUSTOMER_ID) -LEFT JOIN ( - SELECT ORDER_CUSTOMERID Q3P FROM ORDERVIEW - LEFT JOIN ORDERITEMVIEW ON ORDERVIEW.ORDER_ID = ORDERITEM_ORDERID -) AS Q3J ON (Q3J.Q3P = CTE.CUSTOMER_ID); diff --git a/test/optimizer/topn/complex_top_n.test b/test/optimizer/topn/complex_top_n.test index db3d7556ca69..a26f364ae06c 100644 --- a/test/optimizer/topn/complex_top_n.test +++ b/test/optimizer/topn/complex_top_n.test @@ -1,6 +1,6 @@ # name: test/optimizer/topn/complex_top_n.test -# description: topN -# group: [optimizer] +# description: topN +# group: [topn] statement ok SELECT SETSEED(0.42); diff --git a/test/optimizer/topn/topn_optimizer.test b/test/optimizer/topn/topn_optimizer.test index e899e1c7e974..96e0c9ec1c60 100644 --- a/test/optimizer/topn/topn_optimizer.test +++ b/test/optimizer/topn/topn_optimizer.test @@ -1,6 +1,6 @@ -# name: test/optimizer/topn/opn_optimizer.test +# name: test/optimizer/topn/topn_optimizer.test # description: Test Top N optimization -# group: [optimizer] +# group: [topn] statement ok CREATE TABLE integers(i INTEGER, j INTEGER) From 7a5678f9594420d8c1f30dc98824299840cf6652 Mon Sep 17 00:00:00 2001 From: Tom Ebergen Date: Wed, 10 Apr 2024 14:46:07 +0200 Subject: [PATCH 109/147] check for topn in test --- test/optimizer/topn/complex_top_n.test | 35 ++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/test/optimizer/topn/complex_top_n.test b/test/optimizer/topn/complex_top_n.test index a26f364ae06c..04d51fe117b3 100644 --- a/test/optimizer/topn/complex_top_n.test +++ b/test/optimizer/topn/complex_top_n.test @@ -6,16 +6,16 @@ statement ok SELECT SETSEED(0.42); statement ok -create table CUSTOMERVIEW as select range customer_id, range*random()::INT % 3 as customer_priority from range(1000, 2000); +create table CUSTOMERVIEW as select range customer_id, range*random()::INT % 3 as customer_priority from range(1000, 4000); statement ok -create table OrderView as select range order_id, ((random()*2::INT)%2)::BOOL order_isExpeditedShipped, range + (random() * 3)::INT order_customerId from range(1000, 2000); +create table OrderView as select range order_id, ((random()*2::INT)%2)::BOOL order_isExpeditedShipped, range + (random() * 3)::INT order_customerId from range(1000, 4000); statement ok -create table OrderItemView as select random()*25 orderItem_quantity, range orderItem_orderId from range(1000, 2000); +create table OrderItemView as select random()*25 orderItem_quantity, range orderItem_orderId from range(1000, 4000); statement ok -create table CREDITCARDVIEW as select range CREDITCARD_CUSTOMERID from range(1000, 2000); +create table CREDITCARDVIEW as select range CREDITCARD_CUSTOMERID from range(1000, 4000); query III WITH CTE AS ( @@ -40,4 +40,29 @@ LEFT JOIN ( LEFT JOIN ORDERITEMVIEW ON ORDERVIEW.ORDER_ID = ORDERITEM_ORDERID ) AS Q3J ON (Q3J.Q3P = CTE.CUSTOMER_ID); ---- -432 values hashing to c51b3f7dd78a68c95de3f44866394cfb +423 values hashing to 88bbd750b435b7616e6596774a8d5689 + +query II +explain WITH CTE AS ( + SELECT J1P, CUSTOMER_PRIORITY, CUSTOMER_ID FROM 
CUSTOMERVIEW + LEFT JOIN ( + SELECT ORDER_CUSTOMERID, SUM(ORDERITEMVIEW.ORDERITEM_QUANTITY) AS J1P FROM ORDERVIEW + LEFT JOIN ORDERITEMVIEW ON (ORDERVIEW.ORDER_ID = ORDERITEMVIEW.ORDERITEM_ORDERID) + WHERE (ORDERVIEW.ORDER_ISEXPEDITEDSHIPPED IS TRUE) + GROUP BY ORDERVIEW.ORDER_CUSTOMERID + ) AS J1J ON (J1J.ORDER_CUSTOMERID = CUSTOMERVIEW.CUSTOMER_ID) + ORDER BY CUSTOMER_PRIORITY ASC + LIMIT 50 OFFSET 50 +) SELECT J1P, Q2P, Q3P FROM CTE +LEFT JOIN ( + SELECT ORDER_CUSTOMERID FROM ORDERVIEW +) AS Q1J ON (Q1J.ORDER_CUSTOMERID = CTE.CUSTOMER_ID) +LEFT JOIN ( + SELECT CREDITCARD_CUSTOMERID AS Q2P FROM CREDITCARDVIEW +) AS Q2J ON (Q2J.Q2P = CTE.CUSTOMER_ID) +LEFT JOIN ( + SELECT ORDER_CUSTOMERID Q3P FROM ORDERVIEW + LEFT JOIN ORDERITEMVIEW ON ORDERVIEW.ORDER_ID = ORDERITEM_ORDERID +) AS Q3J ON (Q3J.Q3P = CTE.CUSTOMER_ID); +---- +physical_plan :.*TOP_N.* From e8cbb9f5f4e40ad461697c6e57853392665df483 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 10 Apr 2024 15:02:52 +0200 Subject: [PATCH 110/147] Bump postgres to latest main --- .github/config/out_of_tree_extensions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/config/out_of_tree_extensions.cmake b/.github/config/out_of_tree_extensions.cmake index 5d1795ef1168..e766c56004f1 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -67,7 +67,7 @@ if (NOT MINGW) duckdb_extension_load(postgres_scanner DONT_LINK GIT_URL https://github.com/duckdb/postgres_scanner - GIT_TAG 375710fd22a35107b2c28e744f787e1a93a99998 + GIT_TAG 96206f41d5ca7015920a66b54e936c986fe0b0f8 ) endif() From 432b97983ef6eba4d60201cd469dbb5de385ddbf Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 10 Apr 2024 15:04:49 +0200 Subject: [PATCH 111/147] Enable arrow build also for windows (already enabled in nightly) --- .github/config/out_of_tree_extensions.cmake | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/.github/config/out_of_tree_extensions.cmake b/.github/config/out_of_tree_extensions.cmake index e766c56004f1..0f919af9b1e8 100644 --- a/.github/config/out_of_tree_extensions.cmake +++ b/.github/config/out_of_tree_extensions.cmake @@ -16,13 +16,11 @@ # VCPKG_TARGET_TRIPLET=arm64-osx ################# ARROW -if (NOT WIN32) - duckdb_extension_load(arrow - LOAD_TESTS DONT_LINK - GIT_URL https://github.com/duckdb/arrow - GIT_TAG 9e10240da11f61ea7fbfe3fc9988ffe672ccd40f - ) -endif() +duckdb_extension_load(arrow + LOAD_TESTS DONT_LINK + GIT_URL https://github.com/duckdb/arrow + GIT_TAG 9e10240da11f61ea7fbfe3fc9988ffe672ccd40f + ) ################## AWS if (NOT MINGW) From 07028017e5153a003c0fa655822c81f44e0c7e82 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Wed, 10 Apr 2024 22:00:57 +0200 Subject: [PATCH 112/147] Add postgres's new extension_entries --- src/include/duckdb/main/extension_entries.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/include/duckdb/main/extension_entries.hpp b/src/include/duckdb/main/extension_entries.hpp index 3b84b1544e3c..df6741df05a6 100644 --- a/src/include/duckdb/main/extension_entries.hpp +++ b/src/include/duckdb/main/extension_entries.hpp @@ -88,6 +88,7 @@ static constexpr ExtensionFunctionEntry EXTENSION_FUNCTIONS[] = { {"pg_clear_cache", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"pg_timezone_names", "icu", CatalogType::TABLE_FUNCTION_ENTRY}, {"postgres_attach", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, + {"postgres_execute", "postgres_scanner", 
CatalogType::TABLE_FUNCTION_ENTRY}, {"postgres_query", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"postgres_scan", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, {"postgres_scan_pushdown", "postgres_scanner", CatalogType::TABLE_FUNCTION_ENTRY}, @@ -257,6 +258,7 @@ static constexpr ExtensionEntry EXTENSION_SETTINGS[] = { {"pg_experimental_filter_pushdown", "postgres_scanner"}, {"pg_pages_per_task", "postgres_scanner"}, {"pg_use_binary_copy", "postgres_scanner"}, + {"pg_use_ctid_scan", "postgres_scanner"}, {"s3_access_key_id", "httpfs"}, {"s3_endpoint", "httpfs"}, {"s3_region", "httpfs"}, From eb0ad4b8436e0074f3a54987e7e03b5ca4d9fa31 Mon Sep 17 00:00:00 2001 From: Guen Prawiroatmodjo Date: Wed, 10 Apr 2024 18:28:51 -0700 Subject: [PATCH 113/147] SQL_TYPE_TIMESTAMP should use SQL_TYPE_TIMESTAMP instead of SQL_DATETIME --- tools/odbc/src/api_info.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/odbc/src/api_info.cpp b/tools/odbc/src/api_info.cpp index 309479dfa229..3f2918caddd4 100644 --- a/tools/odbc/src/api_info.cpp +++ b/tools/odbc/src/api_info.cpp @@ -98,7 +98,7 @@ const vector ApiInfo::ODBC_SUPPORTED_SQL_TYPES = { { "'BIGINT'", SQL_BIGINT, 19, "NULL", "NULL", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, SQL_FALSE, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_BIGINT, -1, 2, -1}, { "'DATE'", SQL_TYPE_DATE, 10, "''''", "''''", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", -1, -1, SQL_DATETIME, SQL_CODE_DATE, -1, -1}, { "'TIME'", SQL_TYPE_TIME, 8, "''''", "''''", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_DATETIME, SQL_CODE_TIME, -1, -1}, -{ "'TIMESTAMP'", SQL_TYPE_TIMESTAMP, 26, "''''", "''''", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_DATETIME, SQL_CODE_TIMESTAMP, -1, -1}, +{ "'TIMESTAMP'", SQL_TYPE_TIMESTAMP, 26, "''''", "''''", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_TYPE_TIMESTAMP, SQL_CODE_TIMESTAMP, -1, -1}, { "'DECIMAL'", SQL_DECIMAL, 38, "''''", "''''", "'precision,scale'", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 38, SQL_DECIMAL, -1, 10, -1}, { "'NUMERIC'", SQL_NUMERIC, 38, "''''", "''''", "'precision,scale'", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 38, SQL_NUMERIC, -1, 10, -1}, { "'FLOAT'", SQL_FLOAT, 24, "NULL", "NULL", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, SQL_FALSE, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_FLOAT, -1, 2, -1}, From 16d451f312b45da66c40e605f1c6e421bcb39ca7 Mon Sep 17 00:00:00 2001 From: Guen Prawiroatmodjo Date: Wed, 10 Apr 2024 19:21:04 -0700 Subject: [PATCH 114/147] SQL_TYPE_DATE and TIME should use SQL_TYPE_DATE and TIME instead of SQL_DATETIME --- tools/odbc/src/api_info.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/odbc/src/api_info.cpp b/tools/odbc/src/api_info.cpp index 3f2918caddd4..2753b7928eaf 100644 --- a/tools/odbc/src/api_info.cpp +++ b/tools/odbc/src/api_info.cpp @@ -96,8 +96,8 @@ const vector ApiInfo::ODBC_SUPPORTED_SQL_TYPES = { { "'SMALLINT'", SQL_SMALLINT, 5, "NULL", "NULL", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, SQL_FALSE, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_SMALLINT, -1, 2, -1}, { "'INTEGER'", SQL_INTEGER, 10, "NULL", "NULL", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, SQL_FALSE, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_INTEGER, -1, 2, -1}, { "'BIGINT'", SQL_BIGINT, 19, 
"NULL", "NULL", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, SQL_FALSE, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_BIGINT, -1, 2, -1}, -{ "'DATE'", SQL_TYPE_DATE, 10, "''''", "''''", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", -1, -1, SQL_DATETIME, SQL_CODE_DATE, -1, -1}, -{ "'TIME'", SQL_TYPE_TIME, 8, "''''", "''''", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_DATETIME, SQL_CODE_TIME, -1, -1}, +{ "'DATE'", SQL_TYPE_DATE, 10, "''''", "''''", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", -1, -1, SQL_TYPE_DATE, SQL_CODE_DATE, -1, -1}, +{ "'TIME'", SQL_TYPE_TIME, 8, "''''", "''''", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_TYPE_TIME, SQL_CODE_TIME, -1, -1}, { "'TIMESTAMP'", SQL_TYPE_TIMESTAMP, 26, "''''", "''''", "NULL", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 0, SQL_TYPE_TIMESTAMP, SQL_CODE_TIMESTAMP, -1, -1}, { "'DECIMAL'", SQL_DECIMAL, 38, "''''", "''''", "'precision,scale'", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 38, SQL_DECIMAL, -1, 10, -1}, { "'NUMERIC'", SQL_NUMERIC, 38, "''''", "''''", "'precision,scale'", SQL_NULLABLE, SQL_FALSE, SQL_PRED_BASIC, -1, SQL_FALSE, SQL_FALSE, "NULL", 0, 38, SQL_NUMERIC, -1, 10, -1}, From 1c1636ab4877949724440d302a2c3bf32f5cf3d6 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 10:50:45 +0200 Subject: [PATCH 115/147] Pragma assignment with multiple parameters - ParserException instead of InternalException --- src/parser/transform/statement/transform_pragma.cpp | 2 +- ...sterfuzz-testcase-minimized-parse_fuzz_test-5041566491475968 | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) create mode 100644 test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5041566491475968 diff --git a/src/parser/transform/statement/transform_pragma.cpp b/src/parser/transform/statement/transform_pragma.cpp index d55d6a2bb8bc..63a5c35d4559 100644 --- a/src/parser/transform/statement/transform_pragma.cpp +++ b/src/parser/transform/statement/transform_pragma.cpp @@ -49,7 +49,7 @@ unique_ptr Transformer::TransformPragma(duckdb_libpgquery::PGPragm break; case duckdb_libpgquery::PG_PRAGMA_TYPE_ASSIGNMENT: if (info.parameters.size() != 1) { - throw InternalException("PRAGMA statement with assignment should contain exactly one parameter"); + throw ParserException("PRAGMA statement with assignment should contain exactly one parameter"); } if (!info.named_parameters.empty()) { throw InternalException("PRAGMA statement with assignment cannot have named parameters"); diff --git a/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5041566491475968 b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5041566491475968 new file mode 100644 index 000000000000..64f0ff9d3f63 --- /dev/null +++ b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5041566491475968 @@ -0,0 +1 @@ +pragma ÿ=2,a \ No newline at end of file From e92cf9550919961b9f81c5c595d3543b3cd6d6f2 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 10:54:35 +0200 Subject: [PATCH 116/147] Parameters in SET not supported (yet?) 
---
 src/planner/binder/statement/bind_set.cpp                      | 3 +++
 ...terfuzz-testcase-minimized-parse_fuzz_test-5103220313423872 | 1 +
 2 files changed, 4 insertions(+)
 create mode 100644 test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5103220313423872

diff --git a/src/planner/binder/statement/bind_set.cpp b/src/planner/binder/statement/bind_set.cpp
index 15f33d31faea..77c23e22ffd9 100644
--- a/src/planner/binder/statement/bind_set.cpp
+++ b/src/planner/binder/statement/bind_set.cpp
@@ -15,6 +15,9 @@ BoundStatement Binder::Bind(SetVariableStatement &stmt) {
 	// evaluate the scalar value
 	ConstantBinder default_binder(*this, context, "SET value");
 	auto bound_value = default_binder.Bind(stmt.value);
+	if (bound_value->HasParameter()) {
+		throw NotImplementedException("SET statements cannot have parameters");
+	}
 	auto value = ExpressionExecutor::EvaluateScalar(context, *bound_value, true);
 
 	result.plan = make_uniq<LogicalSet>(stmt.name, std::move(value), stmt.scope);
diff --git a/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5103220313423872 b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5103220313423872
new file mode 100644
index 000000000000..9949d4475074
--- /dev/null
+++ b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5103220313423872
@@ -0,0 +1 @@
+set ÿ=?=?
\ No newline at end of file

From 3862ec717912da20785e1fa4698a4f1945e9ec14 Mon Sep 17 00:00:00 2001
From: Mark Raasveldt
Date: Thu, 11 Apr 2024 11:06:28 +0200
Subject: [PATCH 117/147] Correctly check for overflows in hugeint sum/avg

---
 .../aggregate/algebraic/avg.cpp               |  4 +--
 .../aggregate/distributive/sum.cpp            |  4 +--
 .../core_functions/aggregate/sum_helpers.hpp  | 14 +++++++++-
 ...minimized-parse_fuzz_test-5145260887965696 |  1 +
 .../types/hugeint/hugeint_sum_overflow.test   | 26 +++++++++++++++++++
 5 files changed, 44 insertions(+), 5 deletions(-)
 create mode 100644 test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5145260887965696
 create mode 100644 test/sql/types/hugeint/hugeint_sum_overflow.test

diff --git a/src/core_functions/aggregate/algebraic/avg.cpp b/src/core_functions/aggregate/algebraic/avg.cpp
index 9cebfc4a0b02..d00e743ff6ca 100644
--- a/src/core_functions/aggregate/algebraic/avg.cpp
+++ b/src/core_functions/aggregate/algebraic/avg.cpp
@@ -93,7 +93,7 @@ struct IntegerAverageOperation : public BaseSumOperation<AverageSetOperation, RegularAdd> {
-struct IntegerAverageOperationHugeint : public BaseSumOperation<AverageSetOperation, HugeintAdd> {
+struct IntegerAverageOperationHugeint : public BaseSumOperation<AverageSetOperation, AddToHugeint> {
 	template <class T, class STATE>
 	static void Finalize(STATE &state, T &target, AggregateFinalizeData &finalize_data) {
 		if (state.count == 0) {
@@ -105,7 +105,7 @@ struct IntegerAverageOperationHugeint : public BaseSumOperation<AverageSetOperation, AddToHugeint> {
-struct HugeintAverageOperation : public BaseSumOperation<AverageSetOperation, RegularAdd> {
+struct HugeintAverageOperation : public BaseSumOperation<AverageSetOperation, HugeintAdd> {
 	template <class T, class STATE>
 	static void Finalize(STATE &state, T &target, AggregateFinalizeData &finalize_data) {
 		if (state.count == 0) {
diff --git a/src/core_functions/aggregate/distributive/sum.cpp b/src/core_functions/aggregate/distributive/sum.cpp
index 9f243869ad16..0d855297e91d 100644
--- a/src/core_functions/aggregate/distributive/sum.cpp
+++ b/src/core_functions/aggregate/distributive/sum.cpp
@@ -32,7 +32,7 @@ struct IntegerSumOperation : public BaseSumOperation<SumSetOperation, RegularAdd> {
-struct SumToHugeintOperation : public BaseSumOperation<SumSetOperation, HugeintAdd> {
+struct SumToHugeintOperation : public BaseSumOperation<SumSetOperation, AddToHugeint> {
 	template <class T, class STATE>
 	static void Finalize(STATE &state, T &target, AggregateFinalizeData &finalize_data) {
 		if (!state.isset) {
@@ -58,7 +58,7 @@ struct DoubleSumOperation : public BaseSumOperation<SumSetOperation, ADD_OPERATOR> {
 using KahanSumOperation = DoubleSumOperation<KahanAdd>;
 
-struct HugeintSumOperation : public BaseSumOperation<SumSetOperation, RegularAdd> {
+struct HugeintSumOperation : public BaseSumOperation<SumSetOperation, HugeintAdd> {
 	template <class T, class STATE>
 	static void Finalize(STATE &state, T &target, AggregateFinalizeData &finalize_data) {
 		if (!state.isset) {
diff --git a/src/include/duckdb/core_functions/aggregate/sum_helpers.hpp b/src/include/duckdb/core_functions/aggregate/sum_helpers.hpp
index 45f533a7f8c4..355701bdf25d 100644
--- a/src/include/duckdb/core_functions/aggregate/sum_helpers.hpp
+++ b/src/include/duckdb/core_functions/aggregate/sum_helpers.hpp
@@ -65,6 +65,18 @@ struct RegularAdd {
 	}
 };
 
+struct HugeintAdd {
+	template <class STATE, class T>
+	static void AddNumber(STATE &state, T input) {
+		state.value = Hugeint::Add(state.value, input);
+	}
+
+	template <class STATE, class T>
+	static void AddConstant(STATE &state, T input, idx_t count) {
+		AddNumber(state, Hugeint::Multiply(input, count));
+	}
+};
+
 struct KahanAdd {
 	template <class STATE, class T>
 	static void AddNumber(STATE &state, T input) {
@@ -77,7 +89,7 @@ struct KahanAdd {
 	}
 };
 
-struct HugeintAdd {
+struct AddToHugeint {
 	static void AddValue(hugeint_t &result, uint64_t value, int positive) {
 		// integer summation taken from Tim Gubner et al. - Efficient Query Processing
 		// with Optimistically Compressed Hash Tables & Strings in the USSR
diff --git a/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5145260887965696 b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5145260887965696
new file mode 100644
index 000000000000..f6e76e22cab4
--- /dev/null
+++ b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5145260887965696
@@ -0,0 +1 @@
+Summarize select-170141183460469231731687303715884105728
\ No newline at end of file
diff --git a/test/sql/types/hugeint/hugeint_sum_overflow.test b/test/sql/types/hugeint/hugeint_sum_overflow.test
new file mode 100644
index 000000000000..84a57e364474
--- /dev/null
+++ b/test/sql/types/hugeint/hugeint_sum_overflow.test
@@ -0,0 +1,26 @@
+# name: test/sql/types/hugeint/hugeint_sum_overflow.test
+# description: Test hugeint sum overflow
+# group: [hugeint]
+
+statement ok
+PRAGMA enable_verification
+
+statement error
+SELECT SUM(170141183460469231731687303715884105727) FROM range(10);
+----
+Overflow
+
+statement error
+SELECT SUM(x) FROM (VALUES (170141183460469231731687303715884105727), (170141183460469231731687303715884105727)) t(x)
+----
+Overflow
+
+statement error
+SELECT AVG(170141183460469231731687303715884105727) FROM range(10);
+----
+Overflow
+
+statement error
+SELECT AVG(x) FROM (VALUES (170141183460469231731687303715884105727), (170141183460469231731687303715884105727)) t(x)
+----
+Overflow

From ae79db86417afb7766e7df40ea04cfce0726ae01 Mon Sep 17 00:00:00 2001
From: Mark Raasveldt
Date: Thu, 11 Apr 2024 11:17:27 +0200
Subject: [PATCH 118/147] Push correct target scale in decimal rounding

---
 src/core_functions/scalar/math/numeric.cpp                      | 2 +-
 ...sterfuzz-testcase-minimized-parse_fuzz_test-4954980899422208 | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 100644 test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-4954980899422208

diff --git a/src/core_functions/scalar/math/numeric.cpp b/src/core_functions/scalar/math/numeric.cpp
index d47887f93e6a..2594b2f96b54 100644
--- a/src/core_functions/scalar/math/numeric.cpp
+++ b/src/core_functions/scalar/math/numeric.cpp
@@ -675,7 +675,7 @@ unique_ptr<FunctionData> BindDecimalRoundPrecision(ClientContext &context, Scala
 	}
 	bound_function.arguments[0] = decimal_type;
 	bound_function.return_type = LogicalType::DECIMAL(width, target_scale);
-	return make_uniq<RoundPrecisionFunctionData>(round_value);
+	return make_uniq<RoundPrecisionFunctionData>(target_scale);
 }
 
 ScalarFunctionSet RoundFun::GetFunctions() {
diff
--git a/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-4954980899422208 b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-4954980899422208 new file mode 100644 index 000000000000..5af47c2af954 --- /dev/null +++ b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-4954980899422208 @@ -0,0 +1 @@ +sElecT round(.3333333333333333,~2) \ No newline at end of file From a106c42632615d2ee30a62a4393a62f6a22aa2bb Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 11:18:00 +0200 Subject: [PATCH 119/147] Add all new ossfuzz cases --- ...usterfuzz-testcase-minimized-parse_fuzz_test-5177383552352256 | 1 + 1 file changed, 1 insertion(+) create mode 100644 test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5177383552352256 diff --git a/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5177383552352256 b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5177383552352256 new file mode 100644 index 000000000000..1eba605f0071 --- /dev/null +++ b/test/ossfuzz/cases/clusterfuzz-testcase-minimized-parse_fuzz_test-5177383552352256 @@ -0,0 +1 @@ +SElECT-+-170141183460469231731687303715884105728 \ No newline at end of file From 0db2708230604a637f01929f0d2bd7801dd4a651 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 11:47:17 +0200 Subject: [PATCH 120/147] Fix for decimal round - use early-out correctly --- src/core_functions/scalar/math/numeric.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core_functions/scalar/math/numeric.cpp b/src/core_functions/scalar/math/numeric.cpp index 2594b2f96b54..40a2f520d48a 100644 --- a/src/core_functions/scalar/math/numeric.cpp +++ b/src/core_functions/scalar/math/numeric.cpp @@ -575,7 +575,7 @@ static void DecimalRoundNegativePrecisionFunction(DataChunk &input, ExpressionSt auto &info = func_expr.bind_info->Cast(); auto source_scale = DecimalType::GetScale(func_expr.children[0]->return_type); auto width = DecimalType::GetWidth(func_expr.children[0]->return_type); - if (info.target_scale <= -int32_t(width)) { + if (info.target_scale <= -int32_t(width - source_scale)) { // scale too big for width result.SetVectorType(VectorType::CONSTANT_VECTOR); result.SetValue(0, Value::INTEGER(0)); @@ -675,7 +675,7 @@ unique_ptr BindDecimalRoundPrecision(ClientContext &context, Scala } bound_function.arguments[0] = decimal_type; bound_function.return_type = LogicalType::DECIMAL(width, target_scale); - return make_uniq(target_scale); + return make_uniq(round_value); } ScalarFunctionSet RoundFun::GetFunctions() { From f8238ab335c65e49d7e709c4e500589102cf1c72 Mon Sep 17 00:00:00 2001 From: Tmonster Date: Thu, 11 Apr 2024 11:54:05 +0200 Subject: [PATCH 121/147] add require skip reload --- test/optimizer/topn/complex_top_n.test | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/optimizer/topn/complex_top_n.test b/test/optimizer/topn/complex_top_n.test index 04d51fe117b3..8cc825736293 100644 --- a/test/optimizer/topn/complex_top_n.test +++ b/test/optimizer/topn/complex_top_n.test @@ -2,6 +2,8 @@ # description: topN # group: [topn] +require skip_reload + statement ok SELECT SETSEED(0.42); From 7099f6646272022f170909f25db383999dd410a6 Mon Sep 17 00:00:00 2001 From: Carlo Piovesan Date: Thu, 11 Apr 2024 12:09:00 +0200 Subject: [PATCH 122/147] DynamicCastCheck to be done on const T --- src/include/duckdb/common/helper.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/include/duckdb/common/helper.hpp 
b/src/include/duckdb/common/helper.hpp
index b19b85f6d851..d4c07cc47091 100644
--- a/src/include/duckdb/common/helper.hpp
+++ b/src/include/duckdb/common/helper.hpp
@@ -215,9 +215,9 @@ bool RefersToSameObject(const T &a, const T &b) {
 }
 
 template <class TARGET, class SRC>
-void DynamicCastCheck(SRC *source) {
+void DynamicCastCheck(const SRC *source) {
 #ifndef __APPLE__
-	D_ASSERT(dynamic_cast<TARGET *>(source));
+	D_ASSERT(dynamic_cast<const TARGET *>(source));
 #endif
 }

From d110d7992e054130787544773bd29cd99f59a4c6 Mon Sep 17 00:00:00 2001
From: Carlo Piovesan
Date: Thu, 11 Apr 2024 12:09:59 +0200
Subject: [PATCH 123/147] Use DynamicCastCheck both for const and non-const
 Casts

---
 src/include/duckdb/catalog/catalog.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/duckdb/catalog/catalog.hpp b/src/include/duckdb/catalog/catalog.hpp
index 871738a975de..31098ba10f6c 100644
--- a/src/include/duckdb/catalog/catalog.hpp
+++ b/src/include/duckdb/catalog/catalog.hpp
@@ -368,7 +368,7 @@ class Catalog {
 
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

From df14a380d307244e5e939451a02765c56f057840 Mon Sep 17 00:00:00 2001
From: Mark Raasveldt
Date: Thu, 11 Apr 2024 12:15:42 +0200
Subject: [PATCH 124/147] Out-of-range positional reference

---
 .../expression_binder/select_bind_state.cpp        |  2 +-
 .../duckfuzz/order_by_positional_reference.test    | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 test/fuzzer/duckfuzz/order_by_positional_reference.test

diff --git a/src/planner/expression_binder/select_bind_state.cpp b/src/planner/expression_binder/select_bind_state.cpp
index 4b0a19cd2e5b..23ada81d0451 100644
--- a/src/planner/expression_binder/select_bind_state.cpp
+++ b/src/planner/expression_binder/select_bind_state.cpp
@@ -43,7 +43,7 @@ void SelectBindState::AddRegularColumn() {
 }
 
 idx_t SelectBindState::GetFinalIndex(idx_t index) const {
-	if (expanded_column_indices.empty()) {
+	if (index >= expanded_column_indices.size()) {
 		return index;
 	}
 	return expanded_column_indices[index];
diff --git a/test/fuzzer/duckfuzz/order_by_positional_reference.test b/test/fuzzer/duckfuzz/order_by_positional_reference.test
new file mode 100644
index 000000000000..ec567b9a2536
--- /dev/null
+++ b/test/fuzzer/duckfuzz/order_by_positional_reference.test
@@ -0,0 +1,14 @@
+# name: test/fuzzer/duckfuzz/order_by_positional_reference.test
+# description: Mix GROUP BY ALL and positional references
+# group: [duckfuzz]
+
+statement ok
+PRAGMA enable_verification
+
+statement ok
+create table integers(c1 int, c2 int);
+
+statement error
+SELECT c1, c2, NULL FROM integers ORDER BY #10
+----
+term out of range

From 22bb67a06e19ed95876b0161f8deff676348dd92 Mon Sep 17 00:00:00 2001
From: Tishj
Date: Thu, 11 Apr 2024 12:17:30 +0200
Subject: [PATCH 125/147] make the python linter happy

---
 scripts/generate_extensions_function.py | 32 +++++++++++++++----------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/scripts/generate_extensions_function.py b/scripts/generate_extensions_function.py
index dff1dfc9d640..c95fa95f4655 100644
--- a/scripts/generate_extensions_function.py
+++ b/scripts/generate_extensions_function.py
@@ -80,8 +80,9 @@ class ExtensionFunction(NamedTuple):
     name: str
     type: CatalogType
 
+    @staticmethod
     def create_map(input: List[Tuple[str, str, str]]) -> Dict[Function, "ExtensionFunction"]:
-        output: Dict[str, "ExtensionFunction"] = {}
+        output: Dict[Function, "ExtensionFunction"] = {}
         for x in input:
-            key =
Function(x[0], catalog_type_from_type(x[2])) output[key] = ExtensionFunction(x[1], key.name, key.type) @@ -92,6 +93,7 @@ class ExtensionSetting(NamedTuple): extension: str name: str + @staticmethod def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionSetting"]: output: Dict[str, "ExtensionSetting"] = {} for x in input: @@ -103,6 +105,7 @@ class ExtensionCopyFunction(NamedTuple): extension: str name: str + @staticmethod def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionCopyFunction"]: output: Dict[str, "ExtensionCopyFunction"] = {} for x in input: @@ -114,6 +117,7 @@ class ExtensionType(NamedTuple): extension: str name: str + @staticmethod def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionType"]: output: Dict[str, "ExtensionType"] = {} for x in input: @@ -145,7 +149,7 @@ def get_extension_names() -> List[str]: return extension_names -def get_query(sql_query, load_query): +def get_query(sql_query, load_query) -> list: # Optionally perform a LOAD of an extension # Then perform a SQL query, fetch the output query = f'{DUCKDB_PATH} -csv -unsigned -c "{load_query}{sql_query}" ' @@ -171,7 +175,7 @@ def get_functions(load="") -> Set[Function]: return functions -def get_settings(load=""): +def get_settings(load="") -> Set[str]: GET_SETTINGS_QUERY = """ select distinct name @@ -192,12 +196,12 @@ def __init__(self): self.stored_functions: Dict[str, List[Function]] = { 'substrait': [ - Function("from_substrait", "table"), - Function("get_substrait", "table"), - Function("get_substrait_json", "table"), - Function("from_substrait_json", "table"), + Function("from_substrait", CatalogType.TABLE), + Function("get_substrait", CatalogType.TABLE), + Function("get_substrait_json", CatalogType.TABLE), + Function("from_substrait_json", CatalogType.TABLE), ], - 'arrow': [Function("scan_arrow_ipc", "table"), Function("to_arrow_ipc", "table")], + 'arrow': [Function("scan_arrow_ipc", CatalogType.TABLE), Function("to_arrow_ipc", CatalogType.TABLE)], 'spatial': [], } self.stored_settings: Dict[str, List[str]] = {'substrait': [], 'arrow': [], 'spatial': []} @@ -214,8 +218,8 @@ def add_extension(self, extension_name: str): print(f"Load {extension_name} at {extension_path}") load = f"LOAD '{extension_path}';" - extension_functions = get_functions(load) - extension_settings = get_settings(load) + extension_functions = list(get_functions(load)) + extension_settings = list(get_settings(load)) self.add_settings(extension_name, extension_settings) self.add_functions(extension_name, extension_functions) @@ -237,7 +241,7 @@ def add_settings(self, extension_name: str, settings_list: List[str]): extension_name = extension_name.lower() added_settings: Set[str] = set(settings_list) - self.base_settings - settings_to_add: Dict[str, str] = {} + settings_to_add: Dict[str, ExtensionSetting] = {} for setting in added_settings: setting_name = setting.lower() settings_to_add[setting_name] = ExtensionSetting(extension_name, setting_name) @@ -315,7 +319,7 @@ def get_slice_of_file(var_name, file_str): # Parses the extension_entries.hpp file def parse_extension_entries(file_path): - def parse_contents(input) -> tuple: + def parse_contents(input) -> list: # Split the string by comma and remove any leading or trailing spaces elements = input.split(",") # Strip any leading or trailing spaces and surrounding double quotes from each element @@ -330,24 +334,28 @@ def parse_contents(input) -> tuple: ext_functions_file_blob = get_slice_of_file("EXTENSION_FUNCTIONS", file_blob) res = 
pattern.findall(ext_functions_file_blob) res = [parse_contents(x) for x in res] + res = [(x[0], x[1], x[2]) for x in res] cur_function_map = ExtensionFunction.create_map(res) # Get the extension settings ext_settings_file_blob = get_slice_of_file("EXTENSION_SETTINGS", file_blob) res = pattern.findall(ext_settings_file_blob) res = [parse_contents(x) for x in res] + res = [(x[0], x[1]) for x in res] cur_settings_map = ExtensionSetting.create_map(res) # Get the extension types ext_copy_functions_blob = get_slice_of_file("EXTENSION_COPY_FUNCTIONS", file_blob) res = pattern.findall(ext_copy_functions_blob) res = [parse_contents(x) for x in res] + res = [(x[0], x[1]) for x in res] cur_copy_functions_map = ExtensionCopyFunction.create_map(res) # Get the extension types ext_types_file_blob = get_slice_of_file("EXTENSION_TYPES", file_blob) res = pattern.findall(ext_types_file_blob) res = [parse_contents(x) for x in res] + res = [(x[0], x[1]) for x in res] cur_types_map = ExtensionType.create_map(res) return { From 04baa831475b777f978d34cdca80529e8d27a869 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 12:25:53 +0200 Subject: [PATCH 126/147] Correctly deal with impossible implicit casts from array -> list --- src/function/cast_rules.cpp | 6 +++++- test/sql/function/array/array_flatten.test | 16 ++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 test/sql/function/array/array_flatten.test diff --git a/src/function/cast_rules.cpp b/src/function/cast_rules.cpp index 3dc1213c8c9b..98ca78b86655 100644 --- a/src/function/cast_rules.cpp +++ b/src/function/cast_rules.cpp @@ -386,8 +386,12 @@ int64_t CastRules::ImplicitCast(const LogicalType &from, const LogicalType &to) } if (from.id() == LogicalTypeId::ARRAY && to.id() == LogicalTypeId::LIST) { // Arrays can be cast to lists for the cost of casting the child type + auto child_cost = ImplicitCast(ArrayType::GetChildType(from), ListType::GetChildType(to)); + if (child_cost < 0) { + return -1; + } // add 1 because we prefer ARRAY->ARRAY casts over ARRAY->LIST casts - return ImplicitCast(ArrayType::GetChildType(from), ListType::GetChildType(to)) + 1; + return child_cost + 1; } if (from.id() == LogicalTypeId::LIST && (to.id() == LogicalTypeId::ARRAY && !ArrayType::IsAnySize(to))) { // Lists can be cast to arrays for the cost of casting the child type, if the target size is known diff --git a/test/sql/function/array/array_flatten.test b/test/sql/function/array/array_flatten.test new file mode 100644 index 000000000000..406fab2db72d --- /dev/null +++ b/test/sql/function/array/array_flatten.test @@ -0,0 +1,16 @@ +# name: test/sql/function/array/array_flatten.test +# description: Test array flatten function +# group: [array] + +statement ok +PRAGMA enable_verification + +statement error +select flatten(['a', 'b', 'c']::varchar[3]); +---- +No function matches the given name and argument types + +query I +select flatten([['a'], ['b'], ['c']]::varchar[1][3]); +---- +[a, b, c] From 79c1c4b682e4e5d1d811c2a37c619984c0ac1776 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 12:33:55 +0200 Subject: [PATCH 127/147] Use ConstantVector::SetNull --- src/include/duckdb/core_functions/lambda_functions.hpp | 3 ++- test/sql/function/list/lambda_constant_null.test | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 test/sql/function/list/lambda_constant_null.test diff --git a/src/include/duckdb/core_functions/lambda_functions.hpp 
b/src/include/duckdb/core_functions/lambda_functions.hpp index f560bf4ba3c6..624e087dd46a 100644 --- a/src/include/duckdb/core_functions/lambda_functions.hpp +++ b/src/include/duckdb/core_functions/lambda_functions.hpp @@ -88,7 +88,8 @@ class LambdaFunctions { result_validity = &FlatVector::Validity(result); if (list_column.GetType().id() == LogicalTypeId::SQLNULL) { - result_validity->SetInvalid(0); + result.SetVectorType(VectorType::CONSTANT_VECTOR); + ConstantVector::SetNull(result, true); result_is_null = true; return; } diff --git a/test/sql/function/list/lambda_constant_null.test b/test/sql/function/list/lambda_constant_null.test new file mode 100644 index 000000000000..05b5885d44db --- /dev/null +++ b/test/sql/function/list/lambda_constant_null.test @@ -0,0 +1,8 @@ +# name: test/sql/function/list/lambda_constant_null.test +# description: Test constant NULL values in lambdas +# group: [list] + +statement error +select quantile(NULL, filter(NULL, (c103 -> 'babea54a-2261-4b0c-b14b-1d0e9b794e1a'))); +---- +QUANTILE parameter cannot be NULL From 5c9658872056865febe5690acd841f750f8e70b5 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 12:50:57 +0200 Subject: [PATCH 128/147] Shell fuzzer fixes - remove several problematic functions, and implement sqlite3_limit --- tools/shell/shell.c | 205 ------------------ .../sqlite3_api_wrapper.cpp | 25 ++- 2 files changed, 23 insertions(+), 207 deletions(-) diff --git a/tools/shell/shell.c b/tools/shell/shell.c index d228bee5ff98..90d975f8f2fc 100644 --- a/tools/shell/shell.c +++ b/tools/shell/shell.c @@ -1946,167 +1946,6 @@ static void sha3Func( sqlite3_result_blob(context, SHA3Final(&cx), iSize/8, SQLITE_TRANSIENT); } -/* Compute a string using sqlite3_vsnprintf() with a maximum length -** of 50 bytes and add it to the hash. -*/ -static void hash_step_vformat( - SHA3Context *p, /* Add content to this context */ - const char *zFormat, - ... -){ - va_list ap; - int n; - char zBuf[50]; - va_start(ap, zFormat); - sqlite3_vsnprintf(sizeof(zBuf),zBuf,zFormat,ap); - va_end(ap); - n = (int)strlen(zBuf); - SHA3Update(p, (unsigned char*)zBuf, n); -} - -/* -** Implementation of the sha3_query(SQL,SIZE) function. -** -** This function compiles and runs the SQL statement(s) given in the -** argument. The results are hashed using a SIZE-bit SHA3. The default -** size is 256. -** -** The format of the byte stream that is hashed is summarized as follows: -** -** S: -** R -** N -** I -** F -** B: -** T: -** -** is the original SQL text for each statement run and is -** the size of that text. The SQL text is UTF-8. A single R character -** occurs before the start of each row. N means a NULL value. -** I mean an 8-byte little-endian integer . F is a floating point -** number with an 8-byte little-endian IEEE floating point value . -** B means blobs of bytes. T means text rendered as -** bytes of UTF-8. The and values are expressed as an ASCII -** text integers. -** -** For each SQL statement in the X input, there is one S segment. Each -** S segment is followed by zero or more R segments, one for each row in the -** result set. After each R, there are one or more N, I, F, B, or T segments, -** one for each column in the result set. Segments are concatentated directly -** with no delimiters of any kind. 
-*/ -static void sha3QueryFunc( - sqlite3_context *context, - int argc, - sqlite3_value **argv -){ - sqlite3 *db = sqlite3_context_db_handle(context); - const char *zSql = (const char*)sqlite3_value_text(argv[0]); - sqlite3_stmt *pStmt = 0; - int nCol; /* Number of columns in the result set */ - int i; /* Loop counter */ - int rc; - int n; - const char *z; - SHA3Context cx; - int iSize; - - if( argc==1 ){ - iSize = 256; - }else{ - iSize = sqlite3_value_int(argv[1]); - if( iSize!=224 && iSize!=256 && iSize!=384 && iSize!=512 ){ - sqlite3_result_error(context, "SHA3 size should be one of: 224 256 " - "384 512", -1); - return; - } - } - if( zSql==0 ) return; - SHA3Init(&cx, iSize); - while( zSql[0] ){ - rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, &zSql); - if( rc ){ - char *zMsg = sqlite3_mprintf("error SQL statement [%s]: %s", - zSql, sqlite3_errmsg(db)); - sqlite3_finalize(pStmt); - sqlite3_result_error(context, zMsg, -1); - sqlite3_free(zMsg); - return; - } - if( !sqlite3_stmt_readonly(pStmt) ){ - char *zMsg = sqlite3_mprintf("non-query: [%s]", sqlite3_sql(pStmt)); - sqlite3_finalize(pStmt); - sqlite3_result_error(context, zMsg, -1); - sqlite3_free(zMsg); - return; - } - nCol = sqlite3_column_count(pStmt); - z = sqlite3_sql(pStmt); - n = (int)strlen(z); - hash_step_vformat(&cx,"S%d:",n); - SHA3Update(&cx,(unsigned char*)z,n); - - /* Compute a hash over the result of the query */ - while( SQLITE_ROW==sqlite3_step(pStmt) ){ - SHA3Update(&cx,(const unsigned char*)"R",1); - for(i=0; i=1; j--){ - x[j] = u & 0xff; - u >>= 8; - } - x[0] = 'I'; - SHA3Update(&cx, x, 9); - break; - } - case SQLITE_FLOAT: { - sqlite3_uint64 u; - int j; - unsigned char x[9]; - double r = sqlite3_column_double(pStmt,i); - memcpy(&u, &r, 8); - for(j=8; j>=1; j--){ - x[j] = u & 0xff; - u >>= 8; - } - x[0] = 'F'; - SHA3Update(&cx,x,9); - break; - } - case SQLITE_TEXT: { - int n2 = sqlite3_column_bytes(pStmt, i); - const unsigned char *z2 = sqlite3_column_text(pStmt, i); - hash_step_vformat(&cx,"T%d:",n2); - SHA3Update(&cx, z2, n2); - break; - } - case SQLITE_BLOB: { - int n2 = sqlite3_column_bytes(pStmt, i); - const unsigned char *z2 = sqlite3_column_blob(pStmt, i); - hash_step_vformat(&cx,"B%d:",n2); - SHA3Update(&cx, z2, n2); - break; - } - } - } - } - sqlite3_finalize(pStmt); - } - sqlite3_result_blob(context, SHA3Final(&cx), iSize/8, SQLITE_TRANSIENT); -} - - #ifdef _WIN32 #endif @@ -2126,16 +1965,6 @@ int sqlite3_shathree_init( SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC, 0, sha3Func, 0, 0); } - if( rc==SQLITE_OK ){ - rc = sqlite3_create_function(db, "sha3_query", 1, - SQLITE_UTF8 | SQLITE_DIRECTONLY, - 0, sha3QueryFunc, 0, 0); - } - if( rc==SQLITE_OK ){ - rc = sqlite3_create_function(db, "sha3_query", 2, - SQLITE_UTF8 | SQLITE_DIRECTONLY, - 0, sha3QueryFunc, 0, 0); - } return rc; } @@ -14081,36 +13910,6 @@ static unsigned char *readHexDb(ShellState *p, int *pnData){ } #endif /* SQLITE_ENABLE_DESERIALIZE */ -/* -** Scalar function "shell_int32". The first argument to this function -** must be a blob. The second a non-negative integer. This function -** reads and returns a 32-bit big-endian integer from byte -** offset (4*) of the blob. 
-*/ -static void shellInt32( - sqlite3_context *context, - int argc, - sqlite3_value **argv -){ - const unsigned char *pBlob; - int nBlob; - int iInt; - - UNUSED_PARAMETER(argc); - nBlob = sqlite3_value_bytes(argv[0]); - pBlob = (const unsigned char*)sqlite3_value_blob(argv[0]); - iInt = sqlite3_value_int(argv[1]); - - if( iInt>=0 && (iInt+1)*4<=nBlob ){ - const unsigned char *a = &pBlob[iInt*4]; - sqlite3_int64 iVal = ((sqlite3_int64)a[0]<<24) - + ((sqlite3_int64)a[1]<<16) - + ((sqlite3_int64)a[2]<< 8) - + ((sqlite3_int64)a[3]<< 0); - sqlite3_result_int64(context, iVal); - } -} - /* ** Scalar function "shell_idquote(X)" returns string X quoted as an identifier, ** using "..." with internal double-quote characters doubled. @@ -14288,8 +14087,6 @@ static void open_db(ShellState *p, int openFlags){ sqlite3_fileio_init(p->db, 0, 0); sqlite3_shathree_init(p->db, 0, 0); sqlite3_completion_init(p->db, 0, 0); - sqlite3_uint_init(p->db, 0, 0); - sqlite3_decimal_init(p->db, 0, 0); #if !defined(SQLITE_OMIT_VIRTUALTABLE) && defined(SQLITE_ENABLE_DBPAGE_VTAB) sqlite3_dbdata_init(p->db, 0, 0); #endif @@ -14305,8 +14102,6 @@ static void open_db(ShellState *p, int openFlags){ shellPutsFunc, 0, 0); sqlite3_create_function(p->db, "shell_escape_crnl", 1, SQLITE_UTF8, 0, shellEscapeCrnl, 0, 0); - sqlite3_create_function(p->db, "shell_int32", 2, SQLITE_UTF8, 0, - shellInt32, 0, 0); sqlite3_create_function(p->db, "shell_idquote", 1, SQLITE_UTF8, 0, shellIdQuote, 0, 0); #ifndef SQLITE_NOHAVE_SYSTEM diff --git a/tools/sqlite3_api_wrapper/sqlite3_api_wrapper.cpp b/tools/sqlite3_api_wrapper/sqlite3_api_wrapper.cpp index ad2cac292e6d..4d9a008e2457 100644 --- a/tools/sqlite3_api_wrapper/sqlite3_api_wrapper.cpp +++ b/tools/sqlite3_api_wrapper/sqlite3_api_wrapper.cpp @@ -1076,8 +1076,29 @@ int sqlite3_get_autocommit(sqlite3 *db) { } int sqlite3_limit(sqlite3 *, int id, int newVal) { - fprintf(stderr, "sqlite3_limit: unsupported.\n"); - return -1; + if (newVal >= 0) { + // attempting to set limit value + return SQLITE_OK; + } + switch (id) { + case SQLITE_LIMIT_LENGTH: + case SQLITE_LIMIT_SQL_LENGTH: + case SQLITE_LIMIT_COLUMN: + case SQLITE_LIMIT_LIKE_PATTERN_LENGTH: + return std::numeric_limits::max(); + case SQLITE_LIMIT_EXPR_DEPTH: + return 1000; + case SQLITE_LIMIT_FUNCTION_ARG: + case SQLITE_LIMIT_VARIABLE_NUMBER: + return 256; + case SQLITE_LIMIT_ATTACHED: + return 1000; + case SQLITE_LIMIT_WORKER_THREADS: + case SQLITE_LIMIT_TRIGGER_DEPTH: + return 0; + default: + return SQLITE_ERROR; + } } int sqlite3_stmt_readonly(sqlite3_stmt *pStmt) { From ad79569204a1f2f20562656d0863a26ad791b175 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:35:44 -0400 Subject: [PATCH 129/147] pyodide build --- .github/workflows/Pyodide.yml | 99 +++++++++++++++++++ .gitignore | 4 + tools/pythonpkg/pyodide.md | 57 +++++++++++ tools/pythonpkg/setup.py | 10 ++ tools/pythonpkg/src/native/python_objects.cpp | 11 ++- 5 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/Pyodide.yml create mode 100644 tools/pythonpkg/pyodide.md diff --git a/.github/workflows/Pyodide.yml b/.github/workflows/Pyodide.yml new file mode 100644 index 000000000000..290024c3c190 --- /dev/null +++ b/.github/workflows/Pyodide.yml @@ -0,0 +1,99 @@ +name: Pyodide +on: + workflow_call: + inputs: + override_git_describe: + type: string + git_ref: + type: string + skip_tests: + type: string + workflow_dispatch: + inputs: + override_git_describe: + type: string + git_ref: + type: 
string + skip_tests: + type: string + repository_dispatch: + push: + branches: + - "**" + - "!main" + - "!feature" + paths-ignore: + - "**" + - "!.github/workflows/Pyodide.yml" + + pull_request: + types: [opened, reopened, ready_for_review] + paths-ignore: + - "**" + - "!.github/workflows/Pyodide.yml" + +jobs: + build_pyodide: + name: Build pyodide wheel + runs-on: ubuntu-22.04 + strategy: + fail-fast: false + matrix: + version: + - python: "3.10" + pyodide-build: "0.22.1" + - python: "3.11" + pyodide-build: "0.25.1" + steps: + - uses: actions/checkout@v4 + with: + # fetch everything so that the version on the built wheel path is + # correct + fetch-depth: 0 + + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.version.python }} + + - run: pip install 'pyodide-build==${{ matrix.version.pyodide-build }}' 'pydantic<2' + + - name: get emscripten version + id: emscripten-version + run: | + echo "value=$(pyodide config get emscripten_version)" | tee -a "$GITHUB_OUTPUT" + + - uses: mymindstorm/setup-emsdk@v14 + with: + version: ${{ steps.emscripten-version.outputs.value }} + + - name: build wasm wheel + run: pyodide build --exports=whole_archive + working-directory: ./tools/pythonpkg + env: + DUCKDB_CUSTOM_PLATFORM: wasm_eh_pyodide + CFLAGS: "-fexceptions" + LDFLAGS: "-fexceptions" + + - name: smoke test duckdb on pyodide + run: | + pyodide venv .venv-pyodide + source .venv-pyodide/bin/activate + pip install ./tools/pythonpkg/dist/*.whl + + python -V + + python < + + + + + + + + +``` + +## Caveats + +Only Pythons 3.10 and 3.11 are supported right now, with 3.12 support on the way. + +Wheels are tied to a specific version of Pyodide. For example when using +Pyodide version 0.25.1, you must use the cp311-based wheel. + +Some functionality is known to not work, such as extension downloading. + +The default extensions (as well as the `httpfs` extension) that ship with +duckdb Python don't need to be `INSTALL`ed, but others, like `spatial`, won't +work because they cannot be downloaded in the pyodide runtime. 
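+
+As a quick sanity check (mirroring the CI smoke-test step above), the wheel can
+be exercised from inside the Pyodide virtual environment; the snippet below is
+illustrative only:
+
+```python
+import duckdb
+
+# any trivial query proves the wasm build loads and executes
+print(duckdb.sql("SELECT 42 AS answer").fetchall())  # [(42,)]
+```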
diff --git a/tools/pythonpkg/setup.py b/tools/pythonpkg/setup.py index 41ee53f32b92..76ba6a2d13d0 100644 --- a/tools/pythonpkg/setup.py +++ b/tools/pythonpkg/setup.py @@ -121,8 +121,10 @@ class build_ext(CompilerLauncherMixin, _build_ext): extensions = ['parquet', 'icu', 'fts', 'tpch', 'json'] is_android = hasattr(sys, 'getandroidapilevel') +is_pyodide = 'PYODIDE' in os.environ use_jemalloc = ( not is_android + and not is_pyodide and platform.system() == 'Linux' and platform.architecture()[0] == '64bit' and platform.machine() == 'x86_64' @@ -183,12 +185,20 @@ def open_utf8(fpath, flags): define_macros = [('DUCKDB_PYTHON_LIB_NAME', lib_name)] +custom_platform = os.environ.get('DUCKDB_CUSTOM_PLATFORM') +if custom_platform is not None: + define_macros.append(('DUCKDB_CUSTOM_PLATFORM', custom_platform)) + if platform.system() == 'Darwin': toolchain_args.extend(['-stdlib=libc++', '-mmacosx-version-min=10.7']) if platform.system() == 'Windows': define_macros.extend([('DUCKDB_BUILD_LIBRARY', None), ('WIN32', None)]) +if is_pyodide: + # show more useful error messages in the browser + define_macros.append(('PYBIND11_DETAILED_ERROR_MESSAGES', None)) + if 'BUILD_HTTPFS' in os.environ: libraries += ['crypto', 'ssl'] extensions += ['httpfs'] diff --git a/tools/pythonpkg/src/native/python_objects.cpp b/tools/pythonpkg/src/native/python_objects.cpp index 44d0acca4bf7..5c19f2ae13e2 100644 --- a/tools/pythonpkg/src/native/python_objects.cpp +++ b/tools/pythonpkg/src/native/python_objects.cpp @@ -575,7 +575,16 @@ py::object PythonObject::FromValue(const Value &val, const LogicalType &type, auto &array_values = ArrayValue::GetChildren(val); auto array_size = ArrayType::GetSize(type); auto &child_type = ArrayType::GetChildType(type); - py::tuple arr(array_size); + + // do not remove the static cast here, it's required for building + // duckdb-python with Emscripten. 
+ // + // without this cast, a static_assert fails in pybind11 + // because the return type of ArrayType::GetSize is idx_t, + // which is typedef'd to uint64_t and ssize_t is 4 bytes with Emscripten + // and pybind11 requires that the input be castable to ssize_t + py::tuple arr(static_cast(array_size)); + for (idx_t elem_idx = 0; elem_idx < array_size; elem_idx++) { arr[elem_idx] = FromValue(array_values[elem_idx], child_type, client_properties); } From 3e43b6bdfafce142e463c411e658fa7314d7bf77 Mon Sep 17 00:00:00 2001 From: Tishj Date: Thu, 11 Apr 2024 13:04:23 +0200 Subject: [PATCH 130/147] let the generation be modular by default, only replace the extensions that were not build --- scripts/generate_extensions_function.py | 129 +++++++++++++----------- 1 file changed, 72 insertions(+), 57 deletions(-) diff --git a/scripts/generate_extensions_function.py b/scripts/generate_extensions_function.py index c95fa95f4655..316c50617e46 100644 --- a/scripts/generate_extensions_function.py +++ b/scripts/generate_extensions_function.py @@ -32,7 +32,6 @@ from enum import Enum - class CatalogType(str, Enum): SCALAR = "CatalogType::SCALAR_FUNCTION_ENTRY" TABLE = "CatalogType::TABLE_FUNCTION_ENTRY" @@ -125,6 +124,60 @@ def create_map(input: List[Tuple[str, str]]) -> Dict[str, "ExtensionType"]: return output +class ParsedEntries: + def __init__(self, file_path): + self.path = file_path + self.functions = {} + self.settings = {} + self.types = {} + self.copy_functions = {} + + def parse_contents(input) -> list: + # Split the string by comma and remove any leading or trailing spaces + elements = input.split(",") + # Strip any leading or trailing spaces and surrounding double quotes from each element + elements = [element.strip().strip('"') for element in elements] + return elements + + file = open(file_path, 'r') + pattern = re.compile("{(.*(?:, )?)}[,}\n]") + file_blob = file.read() + + # Get the extension functions + ext_functions_file_blob = get_slice_of_file("EXTENSION_FUNCTIONS", file_blob) + res = pattern.findall(ext_functions_file_blob) + res = [parse_contents(x) for x in res] + res = [(x[0], x[1], x[2]) for x in res] + self.functions = ExtensionFunction.create_map(res) + + # Get the extension settings + ext_settings_file_blob = get_slice_of_file("EXTENSION_SETTINGS", file_blob) + res = pattern.findall(ext_settings_file_blob) + res = [parse_contents(x) for x in res] + res = [(x[0], x[1]) for x in res] + self.settings = ExtensionSetting.create_map(res) + + # Get the extension types + ext_copy_functions_blob = get_slice_of_file("EXTENSION_COPY_FUNCTIONS", file_blob) + res = pattern.findall(ext_copy_functions_blob) + res = [parse_contents(x) for x in res] + res = [(x[0], x[1]) for x in res] + self.copy_functions = ExtensionCopyFunction.create_map(res) + + # Get the extension types + ext_types_file_blob = get_slice_of_file("EXTENSION_TYPES", file_blob) + res = pattern.findall(ext_types_file_blob) + res = [parse_contents(x) for x in res] + res = [(x[0], x[1]) for x in res] + self.types = ExtensionType.create_map(res) + + def filter_entries(self, extensions: List[str]): + self.functions = {k: v for k, v in self.functions.items() if v.extension not in extensions} + self.copy_functions = {k: v for k, v in self.copy_functions.items() if v.extension not in extensions} + self.settings = {k: v for k, v in self.settings.items() if v.extension not in extensions} + self.types = {k: v for k, v in self.types.items() if v.extension not in extensions} + + def check_prerequisites(): if not 
os.path.isfile(EXTENSIONS_PATH) or not os.path.isfile(DUCKDB_PATH): print( @@ -210,6 +263,10 @@ def set_base(self): self.base_functions: Set[Function] = get_functions() self.base_settings: Set[str] = get_settings() + def add_entries(self, entries: ParsedEntries): + self.function_map.update(entries.functions) + self.settings_map.update(entries.settings) + def add_extension(self, extension_name: str): if extension_name in self.extensions: # Perform a LOAD and add the added settings/functions @@ -259,19 +316,19 @@ def add_functions(self, extension_name: str, function_list: List[Function]): self.function_map.update(functions_to_add) def validate(self): - parsed_entries = parse_extension_entries(HEADER_PATH) - if self.function_map != parsed_entries['functions']: + parsed_entries = ParsedEntries(HEADER_PATH) + if self.function_map != parsed_entries.functions: print("Function map mismatches:") - print_map_diff(self.function_map, parsed_entries['functions']) + print_map_diff(self.function_map, parsed_entries.functions) exit(1) - if self.settings_map != parsed_entries['settings']: + if self.settings_map != parsed_entries.settings: print("Settings map mismatches:") - print_map_diff(self.settings_map, parsed_entries['settings']) + print_map_diff(self.settings_map, parsed_entries.settings) exit(1) print("All entries found: ") - print(" > functions: " + str(len(parsed_entries['functions']))) - print(" > settings: " + str(len(parsed_entries['settings']))) + print(" > functions: " + str(len(parsed_entries.functions))) + print(" > settings: " + str(len(parsed_entries.settings))) def verify_export(self): if len(self.function_map) == 0 or len(self.settings_map) == 0: @@ -317,55 +374,6 @@ def get_slice_of_file(var_name, file_str): return file_str[begin:end] -# Parses the extension_entries.hpp file -def parse_extension_entries(file_path): - def parse_contents(input) -> list: - # Split the string by comma and remove any leading or trailing spaces - elements = input.split(",") - # Strip any leading or trailing spaces and surrounding double quotes from each element - elements = [element.strip().strip('"') for element in elements] - return elements - - file = open(file_path, 'r') - pattern = re.compile("{(.*(?:, )?)}[,}\n]") - file_blob = file.read() - - # Get the extension functions - ext_functions_file_blob = get_slice_of_file("EXTENSION_FUNCTIONS", file_blob) - res = pattern.findall(ext_functions_file_blob) - res = [parse_contents(x) for x in res] - res = [(x[0], x[1], x[2]) for x in res] - cur_function_map = ExtensionFunction.create_map(res) - - # Get the extension settings - ext_settings_file_blob = get_slice_of_file("EXTENSION_SETTINGS", file_blob) - res = pattern.findall(ext_settings_file_blob) - res = [parse_contents(x) for x in res] - res = [(x[0], x[1]) for x in res] - cur_settings_map = ExtensionSetting.create_map(res) - - # Get the extension types - ext_copy_functions_blob = get_slice_of_file("EXTENSION_COPY_FUNCTIONS", file_blob) - res = pattern.findall(ext_copy_functions_blob) - res = [parse_contents(x) for x in res] - res = [(x[0], x[1]) for x in res] - cur_copy_functions_map = ExtensionCopyFunction.create_map(res) - - # Get the extension types - ext_types_file_blob = get_slice_of_file("EXTENSION_TYPES", file_blob) - res = pattern.findall(ext_types_file_blob) - res = [parse_contents(x) for x in res] - res = [(x[0], x[1]) for x in res] - cur_types_map = ExtensionType.create_map(res) - - return { - 'functions': cur_function_map, - 'settings': cur_settings_map, - 'types': cur_types_map, - 
'copy_functions': cur_copy_functions_map, - } - - def print_map_diff(d1, d2): s1 = sorted(set(d1.items())) s2 = sorted(set(d2.items())) @@ -551,11 +559,18 @@ def main(): # Collect the list of functions/settings without any extensions loaded extension_data.set_base() + # TODO: add 'purge' option to ignore existing entries ?? + parsed_entries = ParsedEntries(HEADER_PATH) + parsed_entries.filter_entries(extension_names) + for extension_name in extension_names: print(extension_name) # For every extension, add the functions/settings added by the extension extension_data.add_extension(extension_name) + # Add the entries we initially parsed from the HEADER_PATH + extension_data.add_entries(parsed_entries) + if args.validate: extension_data.validate() return From 4465345de395c12cfda6a5d831aa7c2e1c1186f0 Mon Sep 17 00:00:00 2001 From: Tishj Date: Thu, 11 Apr 2024 13:32:52 +0200 Subject: [PATCH 131/147] statically link mbedtls into parquet, as it's required to load parquet dynamically --- extension/parquet/CMakeLists.txt | 1 + scripts/generate_extensions_function.py | 1 + 2 files changed, 2 insertions(+) diff --git a/extension/parquet/CMakeLists.txt b/extension/parquet/CMakeLists.txt index 718bb3af91ae..f4d415dbeeb9 100644 --- a/extension/parquet/CMakeLists.txt +++ b/extension/parquet/CMakeLists.txt @@ -66,6 +66,7 @@ endif() build_static_extension(parquet ${PARQUET_EXTENSION_FILES}) set(PARAMETERS "-warnings") build_loadable_extension(parquet ${PARAMETERS} ${PARQUET_EXTENSION_FILES}) +target_link_libraries(parquet_loadable_extension duckdb_mbedtls) install( TARGETS parquet_extension diff --git a/scripts/generate_extensions_function.py b/scripts/generate_extensions_function.py index 316c50617e46..be00d406eda0 100644 --- a/scripts/generate_extensions_function.py +++ b/scripts/generate_extensions_function.py @@ -32,6 +32,7 @@ from enum import Enum + class CatalogType(str, Enum): SCALAR = "CatalogType::SCALAR_FUNCTION_ENTRY" TABLE = "CatalogType::TABLE_FUNCTION_ENTRY" From 453b32aaf518539c6a2b1449572d7dafb437b397 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 15:28:22 +0200 Subject: [PATCH 132/147] array_length should return NULL when the input value is NULL for consistency --- src/function/scalar/string/length.cpp | 25 +++++++++++++++++-- test/sql/function/array/array_length.test | 30 ++++++++++++++++++++++- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/src/function/scalar/string/length.cpp b/src/function/scalar/string/length.cpp index 218e4e84626d..dd88fd8d1441 100644 --- a/src/function/scalar/string/length.cpp +++ b/src/function/scalar/string/length.cpp @@ -70,7 +70,6 @@ static unique_ptr LengthPropagateStats(ClientContext &context, F //------------------------------------------------------------------ // ARRAY / LIST LENGTH //------------------------------------------------------------------ - static void ListLengthFunction(DataChunk &args, ExpressionState &state, Vector &result) { auto &input = args.data[0]; D_ASSERT(input.GetType().id() == LogicalTypeId::LIST); @@ -83,9 +82,31 @@ static void ListLengthFunction(DataChunk &args, ExpressionState &state, Vector & static void ArrayLengthFunction(DataChunk &args, ExpressionState &state, Vector &result) { auto &input = args.data[0]; - // If the input is an array, the length is constant + + UnifiedVectorFormat format; + args.data[0].ToUnifiedFormat(args.size(), format); + + // for arrays the length is constant result.SetVectorType(VectorType::CONSTANT_VECTOR); ConstantVector::GetData(result)[0] = 
static_cast(ArrayType::GetSize(input.GetType())); + + // but we do need to take null values into account + if (format.validity.AllValid()) { + // if there are no null values we can just return the constant + return; + } + // otherwise we flatten and inherit the null values of the parent + result.Flatten(args.size()); + auto &result_validity = FlatVector::Validity(result); + for (idx_t r = 0; r < args.size(); r++) { + auto idx = format.sel->get_index(r); + if (!format.validity.RowIsValid(idx)) { + result_validity.SetInvalid(r); + } + } + if (args.AllConstant()) { + result.SetVectorType(VectorType::CONSTANT_VECTOR); + } } static unique_ptr ArrayOrListLengthBind(ClientContext &context, ScalarFunction &bound_function, diff --git a/test/sql/function/array/array_length.test b/test/sql/function/array/array_length.test index 4c82583007f0..b82ec6a05e41 100644 --- a/test/sql/function/array/array_length.test +++ b/test/sql/function/array/array_length.test @@ -10,6 +10,35 @@ SELECT length(array_value(1, 2, 3)); ---- 3 +# array length for NULL values +statement ok +create table arrays(a int[3]); + +statement ok +insert into arrays values ([1, 2, 3]), ([4, 5, 6]) + +query I +select length(a) from arrays; +---- +3 +3 + +query I +select length(NULL::int[3]) from arrays; +---- +NULL +NULL + +statement ok +insert into arrays values (NULL); + +query I +select length(a) from arrays; +---- +3 +3 +NULL + # Array length with dimension argument query I SELECT array_length(array_value(array_value(1, 2, 2), array_value(3, 4, 3)), 1); @@ -26,7 +55,6 @@ SELECT array_length(array_value(array_value(1, 2, 2), array_value(3, 4, 3)), 3); ---- Out of Range Error: array_length dimension '3' out of range (min: '1', max: '2') - statement error SELECT array_length(array_value(array_value(1, 2, 2), array_value(3, 4, 3)), 0); ---- From 13e70f797a4744b9f8e4146869c161a39376370a Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 15:38:03 +0200 Subject: [PATCH 133/147] Update fuzzer scripts --- scripts/fuzzer_helper.py | 89 ++++++++++++++++++++++------------------ scripts/reduce_sql.py | 21 +++------- scripts/run_fuzzer.py | 46 +++++++++------------ 3 files changed, 72 insertions(+), 84 deletions(-) diff --git a/scripts/fuzzer_helper.py b/scripts/fuzzer_helper.py index e36275e9b0c5..9d73f1985243 100644 --- a/scripts/fuzzer_helper.py +++ b/scripts/fuzzer_helper.py @@ -41,31 +41,24 @@ footer = ''' ```''' - -def get_github_hash(): - proc = subprocess.Popen(['git', 'rev-parse', 'HEAD'], stdout=subprocess.PIPE) - return proc.stdout.read().decode('utf8').strip() - - # github stuff def issue_url(): return 'https://api.github.com/repos/%s/%s/issues' % (REPO_OWNER, REPO_NAME) - def create_session(): # Create an authenticated session to create the issue session = requests.Session() session.headers.update({'Authorization': 'token %s' % (TOKEN,)}) return session - def make_github_issue(title, body): if len(title) > 240: # avoid title is too long error (maximum is 256 characters) title = title[:240] + '...' 
session = create_session() url = issue_url() - issue = {'title': title, 'body': body} + issue = {'title': title, + 'body': body} r = session.post(url, json.dumps(issue)) if r.status_code == 201: print('Successfully created Issue "%s"' % title) @@ -74,10 +67,9 @@ def make_github_issue(title, body): print('Response:', r.content.decode('utf8')) raise Exception("Failed to create issue") - -def get_github_issues(): +def get_github_issues(page): session = create_session() - url = issue_url() + url = issue_url()+'?per_page=100&page='+str(page) r = session.get(url) if r.status_code != 200: print('Failed to get list of issues') @@ -85,7 +77,6 @@ def get_github_issues(): raise Exception("Failed to get list of issues") return json.loads(r.content.decode('utf8')) - def close_github_issue(number): session = create_session() url = issue_url() + '/' + str(number) @@ -98,73 +89,89 @@ def close_github_issue(number): print('Response:', r.content.decode('utf8')) raise Exception("Failed to close issue") +def label_github_issue(number, label): + session = create_session() + url = issue_url() + '/' + str(number) + params = {'labels': [label]} + r = session.patch(url, json.dumps(params)) + if r.status_code == 200: + print(f'Successfully labeled Issue "{number}"') + else: + print(f'Could not label Issue "{number}" (status code {r.status_code})') + print('Response:', r.content.decode('utf8')) + raise Exception("Failed to label issue") def extract_issue(body, nr): try: splits = body.split(middle) sql = splits[0].split(header)[1] - error = splits[1][: -len(footer)] + error = splits[1][:-len(footer)] return (sql, error) except: print(f"Failed to extract SQL/error message from issue {nr}") print(body) return None - def run_shell_command_batch(shell, cmd): command = [shell, '--batch', '-init', '/dev/null'] - res = subprocess.run(command, input=bytearray(cmd, 'utf8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + try: + res = subprocess.run(command, input=bytearray(cmd, 'utf8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=300) + except subprocess.TimeoutExpired: + print(f"TIMEOUT... 
{cmd}") + return ("", "", 0, True) stdout = res.stdout.decode('utf8').strip() stderr = res.stderr.decode('utf8').strip() - return (stdout, stderr, res.returncode) - + return (stdout, stderr, res.returncode, False) -def test_reproducibility(shell, issue, current_errors): +def test_reproducibility(shell, issue, current_errors, perform_check): extract = extract_issue(issue['body'], issue['number']) + labels = issue['labels'] + label_timeout = False + for label in labels: + if label['name'] == 'timeout': + label_timeout = True if extract is None: # failed extract: leave the issue as-is return True sql = extract[0] + ';' error = extract[1] - (stdout, stderr, returncode) = run_shell_command_batch(shell, sql) - if returncode == 0: - return False - if not fuzzer_helper.is_internal_error(stderr): - return False + if perform_check is True and label_timeout is False: + print(f"Checking issue {issue['number']}...") + (stdout, stderr, returncode, is_timeout) = run_shell_command_batch(shell, sql) + if is_timeout: + label_github_issue(issue['number'], 'timeout') + else: + if returncode == 0: + return False + if not fuzzer_helper.is_internal_error(stderr): + return False # issue is still reproducible current_errors[error] = issue return True - -def extract_github_issues(shell): +def extract_github_issues(shell, perform_check): current_errors = dict() - issues = get_github_issues() - for issue in issues: - # check if the github issue is still reproducible - if not test_reproducibility(shell, issue, current_errors): - # the issue appears to be fixed - close the issue - print(f"Failed to reproduce issue {issue['number']}, closing...") - close_github_issue(int(issue['number'])) + for p in range(1,10): + issues = get_github_issues(p) + for issue in issues: + # check if the github issue is still reproducible + if not test_reproducibility(shell, issue, current_errors, perform_check): + # the issue appears to be fixed - close the issue + print(f"Failed to reproduce issue {issue['number']}, closing...") + close_github_issue(int(issue['number'])) return current_errors - def file_issue(cmd, error_msg, fuzzer, seed, hash): # issue is new, file it print("Filing new issue to Github") title = error_msg - body = ( - fuzzer_desc.replace("${FUZZER}", fuzzer) - .replace("${FULL_HASH}", hash) - .replace("${SHORT_HASH}", hash[:5]) - .replace("${SEED}", str(seed)) - ) + body = fuzzer_desc.replace("${FUZZER}", fuzzer).replace("${FULL_HASH}", hash).replace("${SHORT_HASH}", hash[:5]).replace("${SEED}", str(seed)) body += header + cmd + middle + error_msg + footer print(title, body) make_github_issue(title, body) - def is_internal_error(error): if 'differs from original result' in error: return True diff --git a/scripts/reduce_sql.py b/scripts/reduce_sql.py index e266fd1d1aa0..f81669a73391 100644 --- a/scripts/reduce_sql.py +++ b/scripts/reduce_sql.py @@ -11,9 +11,8 @@ SELECT * FROM reduce_sql_statement('${QUERY}'); ''' - def sanitize_error(err): - err = re.sub(r'Error: near line \d+: ', '', err) + err = re.sub('Error: near line \d+: ', '', err) err = err.replace(os.getcwd() + '/', '') err = err.replace(os.getcwd(), '') if 'AddressSanitizer' in err: @@ -21,7 +20,6 @@ def sanitize_error(err): err = 'AddressSanitizer error ' + match return err - def run_shell_command(shell, cmd): command = [shell, '-csv', '--batch', '-init', '/dev/null'] @@ -30,7 +28,6 @@ def run_shell_command(shell, cmd): stderr = res.stderr.decode('utf8').strip() return (stdout, stderr, res.returncode) - def get_reduced_sql(shell, sql_query): reduce_query = 
get_reduced_query.replace('${QUERY}', sql_query.replace("'", "''")) (stdout, stderr, returncode) = run_shell_command(shell, reduce_query) @@ -43,7 +40,6 @@ def get_reduced_sql(shell, sql_query): reduce_candidates.append(line.strip('"').replace('""', '"')) return reduce_candidates[1:] - def reduce(sql_query, data_load, shell, error_msg, max_time_seconds=300): start = time.time() while True: @@ -70,22 +66,18 @@ def reduce(sql_query, data_load, shell, error_msg, max_time_seconds=300): break return sql_query - def is_ddl_query(query): query = query.lower() if 'create' in query or 'insert' in query or 'update' in query or 'delete' in query: return True return False - def initial_cleanup(query_log): query_log = query_log.replace('SELECT * FROM pragma_version()\n', '') return query_log - def run_queries_until_crash_mp(queries, result_file): import duckdb - con = duckdb.connect() sqlite_con = sqlite3.connect(result_file) sqlite_con.execute('CREATE TABLE queries(id INT, text VARCHAR)') @@ -110,7 +102,7 @@ def run_queries_until_crash_mp(queries, result_file): keep_query = True sqlite_con.execute('UPDATE result SET text=?', (exception_error,)) if not keep_query: - sqlite_con.execute('DELETE FROM queries WHERE id=?', (id,)) + sqlite_con.execute('DELETE FROM queries WHERE id=?', (id, )) if is_internal_error: # found internal error: no need to try further queries break @@ -121,7 +113,6 @@ def run_queries_until_crash_mp(queries, result_file): sqlite_con.commit() sqlite_con.close() - def run_queries_until_crash(queries): sqlite_file = 'cleaned_queries.db' if os.path.isfile(sqlite_file): @@ -149,10 +140,8 @@ def cleanup_irrelevant_queries(query_log): queries = [x for x in query_log.split(';\n') if len(x) > 0] return run_queries_until_crash(queries) - # def reduce_internal(start, sql_query, data_load, queries_final, shell, error_msg, max_time_seconds=300): - def reduce_query_log_query(start, shell, queries, query_index, max_time_seconds): new_query_list = queries[:] sql_query = queries[query_index] @@ -184,7 +173,6 @@ def reduce_query_log_query(start, shell, queries, query_index, max_time_seconds) break return sql_query - def reduce_query_log(queries, shell, max_time_seconds=300): start = time.time() current_index = 0 @@ -195,7 +183,7 @@ def reduce_query_log(queries, shell, max_time_seconds=300): if current_time - start > max_time_seconds: break # remove the query at "current_index" - new_queries = queries[:current_index] + queries[current_index + 1 :] + new_queries = queries[:current_index] + queries[current_index + 1:] # try to run the queries and check if we still get the same error (new_queries_x, current_error) = run_queries_until_crash(new_queries) if current_error is None: @@ -215,6 +203,7 @@ def reduce_query_log(queries, shell, max_time_seconds=300): return queries + # Example usage: # error_msg = 'INTERNAL Error: Assertion triggered in file "/Users/myth/Programs/duckdb-bugfix/src/common/types/data_chunk.cpp" on line 41: !types.empty()' # shell = 'build/debug/duckdb' @@ -271,4 +260,4 @@ def reduce_query_log(queries, shell, max_time_seconds=300): # limit 88 # ''' # -# print(reduce(sql_query, data_load, shell, error_msg)) +# print(reduce(sql_query, data_load, shell, error_msg)) \ No newline at end of file diff --git a/scripts/run_fuzzer.py b/scripts/run_fuzzer.py index 86bf65537495..57f7a677b3fd 100644 --- a/scripts/run_fuzzer.py +++ b/scripts/run_fuzzer.py @@ -12,15 +12,22 @@ fuzzer = None db = None shell = None +perform_checks = True for param in sys.argv: if param == '--sqlsmith': fuzzer = 
'sqlsmith' elif param == '--duckfuzz': fuzzer = 'duckfuzz' + elif param == '--duckfuzz_functions': + fuzzer = 'duckfuzz_functions' elif param == '--alltypes': db = 'alltypes' elif param == '--tpch': db = 'tpch' + elif param == '--emptyalltypes': + db = 'emptyalltypes' + elif param == '--no_checks': + perform_checks = False elif param.startswith('--shell='): shell = param.replace('--shell=', '') elif param.startswith('--seed='): @@ -31,7 +38,7 @@ exit(1) if db is None: - print("Unrecognized database to run on, expected either --tpch or --alltypes") + print("Unrecognized database to run on, expected either --tpch, --alltypes or --emptyalltypes") exit(1) if shell is None: @@ -41,18 +48,18 @@ if seed < 0: seed = random.randint(0, 2**30) -git_hash = fuzzer_helper.get_github_hash() - +git_hash = os.getenv('DUCKDB_HASH') def create_db_script(db): if db == 'alltypes': return 'create table all_types as select * exclude(small_enum, medium_enum, large_enum) from test_all_types();' elif db == 'tpch': return 'call dbgen(sf=0.1);' + elif db == 'emptyalltypes': + return 'create table all_types as select * exclude(small_enum, medium_enum, large_enum) from test_all_types() limit 0;' else: raise Exception("Unknown database creation script") - def run_fuzzer_script(fuzzer): if fuzzer == 'sqlsmith': return "call sqlsmith(max_queries=${MAX_QUERIES}, seed=${SEED}, verbose_output=1, log='${LAST_LOG_FILE}', complete_log='${COMPLETE_LOG_FILE}');" @@ -63,7 +70,6 @@ def run_fuzzer_script(fuzzer): else: raise Exception("Unknown fuzzer type") - def get_fuzzer_name(fuzzer): if fuzzer == 'sqlsmith': return 'SQLSmith' @@ -74,7 +80,6 @@ def get_fuzzer_name(fuzzer): else: return 'Unknown' - def run_shell_command(cmd): command = [shell, '--batch', '-init', '/dev/null'] @@ -85,27 +90,19 @@ def run_shell_command(cmd): # first get a list of all github issues, and check if we can still reproduce them -current_errors = fuzzer_helper.extract_github_issues(shell) +current_errors = fuzzer_helper.extract_github_issues(shell, perform_checks) -max_queries = 1000 +max_queries = 2000 last_query_log_file = 'sqlsmith.log' complete_log_file = 'sqlsmith.complete.log' -print( - f'''========================================== +print(f'''========================================== RUNNING {fuzzer} on {db} -==========================================''' -) +==========================================''') load_script = create_db_script(db) fuzzer_name = get_fuzzer_name(fuzzer) -fuzzer = ( - run_fuzzer_script(fuzzer) - .replace('${MAX_QUERIES}', str(max_queries)) - .replace('${LAST_LOG_FILE}', last_query_log_file) - .replace('${COMPLETE_LOG_FILE}', complete_log_file) - .replace('${SEED}', str(seed)) -) +fuzzer = run_fuzzer_script(fuzzer).replace('${MAX_QUERIES}', str(max_queries)).replace('${LAST_LOG_FILE}', last_query_log_file).replace('${COMPLETE_LOG_FILE}', complete_log_file).replace('${SEED}', str(seed)) print(load_script) print(fuzzer) @@ -116,11 +113,9 @@ def run_shell_command(cmd): (stdout, stderr, returncode) = run_shell_command(cmd) -print( - f'''========================================== +print(f'''========================================== FINISHED RUNNING -==========================================''' -) +==========================================''') print("============== STDOUT ================") print(stdout) print("============== STDERR =================") @@ -165,10 +160,7 @@ def run_shell_command(cmd): # check if this is a duplicate issue if error_msg in current_errors: print("Skip filing duplicate issue") - print( - "Issue already 
exists: https://github.com/duckdb/duckdb-fuzzer/issues/" - + str(current_errors[error_msg]['number']) - ) + print("Issue already exists: https://github.com/duckdb/duckdb-fuzzer/issues/" + str(current_errors[error_msg]['number'])) exit(0) print(last_query) From 22d120f941fe5b16f06b45ed8dea9ba41f4debcf Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 15:49:44 +0200 Subject: [PATCH 134/147] Make it possible to use reduce_sql from the command line --- scripts/fuzzer_helper.py | 28 ++++++++++++++------------- scripts/reduce_sql.py | 42 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/scripts/fuzzer_helper.py b/scripts/fuzzer_helper.py index 9d73f1985243..d457b3172d49 100644 --- a/scripts/fuzzer_helper.py +++ b/scripts/fuzzer_helper.py @@ -6,20 +6,8 @@ import reduce_sql import fuzzer_helper -if 'FUZZEROFDUCKSKEY' not in os.environ: - print("FUZZEROFDUCKSKEY not found in environment variables") - exit(1) USERNAME = 'fuzzerofducks' -TOKEN = os.environ['FUZZEROFDUCKSKEY'] - -if len(TOKEN) == 0: - print("FUZZEROFDUCKSKEY is set but is empty") - exit(1) - -if len(TOKEN) != 40: - print("Incorrect length for FUZZEROFDUCKSKEY") - exit(1) REPO_OWNER = 'duckdb' REPO_NAME = 'duckdb-fuzzer' @@ -45,10 +33,24 @@ def issue_url(): return 'https://api.github.com/repos/%s/%s/issues' % (REPO_OWNER, REPO_NAME) + +def get_token(): + if 'FUZZEROFDUCKSKEY' not in os.environ: + print("FUZZEROFDUCKSKEY not found in environment variables") + exit(1) + token = os.environ['FUZZEROFDUCKSKEY'] + if len(token) == 0: + print("FUZZEROFDUCKSKEY is set but is empty") + exit(1) + + if len(token) != 40: + print("Incorrect length for FUZZEROFDUCKSKEY") + exit(1) + return token def create_session(): # Create an authenticated session to create the issue session = requests.Session() - session.headers.update({'Authorization': 'token %s' % (TOKEN,)}) + session.headers.update({'Authorization': 'token %s' % (get_token(),)}) return session def make_github_issue(title, body): diff --git a/scripts/reduce_sql.py b/scripts/reduce_sql.py index f81669a73391..de30dea09872 100644 --- a/scripts/reduce_sql.py +++ b/scripts/reduce_sql.py @@ -6,13 +6,16 @@ import multiprocessing import sqlite3 -multiprocessing.set_start_method('fork') +try: + multiprocessing.set_start_method('fork') +except RuntimeError: + pass get_reduced_query = ''' SELECT * FROM reduce_sql_statement('${QUERY}'); ''' def sanitize_error(err): - err = re.sub('Error: near line \d+: ', '', err) + err = re.sub(r'Error: near line \d+: ', '', err) err = err.replace(os.getcwd() + '/', '') err = err.replace(os.getcwd(), '') if 'AddressSanitizer' in err: @@ -203,6 +206,41 @@ def reduce_query_log(queries, shell, max_time_seconds=300): return queries +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser(description='Reduce a problematic SQL query') + parser.add_argument('--shell', dest='shell', action='store', help='Path to the shell executable', default='build/debug/duckdb') + parser.add_argument('--load', dest='load', action='store', help='Path to the data load script', required=True) + parser.add_argument('--exec', dest='exec', action='store', help='Path to the executable script', required=True) + parser.add_argument('--inplace', dest='inplace', action='store_true', help='If true, overrides the exec script with the final query') + parser.add_argument('--max-time', dest='max_time', action='store', help='Maximum time in seconds to run the reducer', default=300) + + args = parser.parse_args() + 
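+    # example invocation (paths are illustrative):
+    #   python3 scripts/reduce_sql.py --shell build/debug/duckdb --load load.sql --exec exec.sql --inplace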
print("Starting reduce process") + + shell = args.shell + data_load = open(args.load).read() + sql_query = open(args.exec).read() + (stdout, stderr, returncode) = run_shell_command(shell, data_load + sql_query) + expected_error = sanitize_error(stderr) + + print("===================================================") + print("Found expected error") + print("===================================================") + print(expected_error) + print("===================================================") + + + final_query = reduce(sql_query, data_load, shell, expected_error, args.max_time) + print("Found final reduced query") + print("===================================================") + print(final_query) + print("===================================================") + if args.inplace: + print(f"Writing to file {args.exec}") + with open(args.exec, 'w+') as f: + f.write(final_query) + # Example usage: # error_msg = 'INTERNAL Error: Assertion triggered in file "/Users/myth/Programs/duckdb-bugfix/src/common/types/data_chunk.cpp" on line 41: !types.empty()' From 164a61d00d4911e4a388e4241515712d832e949f Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 16:41:15 +0200 Subject: [PATCH 135/147] Expand statement simplifier to recurse correctly in more situations allowing for more complex queries to be simplified more --- .../sqlsmith/include/statement_simplifier.hpp | 12 +- extension/sqlsmith/statement_simplifier.cpp | 119 ++++++++++++------ scripts/reduce_sql.py | 4 + 3 files changed, 87 insertions(+), 48 deletions(-) diff --git a/extension/sqlsmith/include/statement_simplifier.hpp b/extension/sqlsmith/include/statement_simplifier.hpp index cda89a829dd0..2bdf2715cd18 100644 --- a/extension/sqlsmith/include/statement_simplifier.hpp +++ b/extension/sqlsmith/include/statement_simplifier.hpp @@ -50,16 +50,10 @@ class StatementSimplifier { template void SimplifyReplace(T &element, T &other); - template - void SimplifyListReplace(T &element, vector &list); - - template - void SimplifyListReplaceNull(vector &list); - template void SimplifyOptional(duckdb::unique_ptr &opt); - void Simplify(TableRef &ref); + void Simplify(unique_ptr &ref); void Simplify(SelectNode &node); void Simplify(SetOperationNode &node); @@ -69,6 +63,10 @@ class StatementSimplifier { void Simplify(OrderModifier &modifier); void SimplifyExpression(duckdb::unique_ptr &expr); + void SimplifyOptionalExpression(duckdb::unique_ptr &expr); + void SimplifyChildExpression(duckdb::unique_ptr &expr, unique_ptr &child); + void SimplifyExpressionList(duckdb::unique_ptr &expr, vector> &expression_list); + void SimplifyExpressionList(vector> &expression_list, bool is_optional = true); void Simplify(CommonTableExpressionMap &cte_map); void Simplify(UpdateSetInfo &info); diff --git a/extension/sqlsmith/statement_simplifier.cpp b/extension/sqlsmith/statement_simplifier.cpp index 05fd4de43288..edfd3437ee70 100644 --- a/extension/sqlsmith/statement_simplifier.cpp +++ b/extension/sqlsmith/statement_simplifier.cpp @@ -44,21 +44,6 @@ void StatementSimplifier::SimplifyList(vector &list, bool is_optional) { } } -template -void StatementSimplifier::SimplifyListReplaceNull(vector &list) { - for (idx_t i = 0; i < list.size(); i++) { - duckdb::unique_ptr constant = make_uniq(Value()); - SimplifyReplace(list[i], constant); - } -} - -template -void StatementSimplifier::SimplifyListReplace(T &element, vector &list) { - for (idx_t i = 0; i < list.size(); i++) { - SimplifyReplace(element, list[i]); - } -} - template void 
StatementSimplifier::SimplifyOptional(duckdb::unique_ptr &opt) { if (!opt) { @@ -69,21 +54,24 @@ void StatementSimplifier::SimplifyOptional(duckdb::unique_ptr &opt) { opt = std::move(n); } -void StatementSimplifier::Simplify(TableRef &ref) { - switch (ref.type) { +void StatementSimplifier::Simplify(unique_ptr &ref) { + switch (ref->type) { case TableReferenceType::SUBQUERY: { - auto &subquery = ref.Cast(); + auto &subquery = ref->Cast(); Simplify(*subquery.subquery->node); break; } case TableReferenceType::JOIN: { - auto &cp = ref.Cast(); - Simplify(*cp.left); - Simplify(*cp.right); + auto &cp = ref->Cast(); + Simplify(cp.left); + Simplify(cp.right); + SimplifyOptional(cp.condition); + SimplifyReplace(ref, cp.left); + SimplifyReplace(ref, cp.right); break; } case TableReferenceType::EXPRESSION_LIST: { - auto &expr_list = ref.Cast(); + auto &expr_list = ref->Cast(); if (expr_list.values.size() == 1) { SimplifyList(expr_list.values[0]); } else if (expr_list.values.size() > 1) { @@ -98,18 +86,18 @@ void StatementSimplifier::Simplify(TableRef &ref) { void StatementSimplifier::Simplify(SelectNode &node) { // simplify projection list - SimplifyList(node.select_list, false); + SimplifyExpressionList(node.select_list, false); // from clause SimplifyOptional(node.from_table); // simplify groups SimplifyList(node.groups.grouping_sets); // simplify filters - SimplifyOptional(node.where_clause); - SimplifyOptional(node.having); - SimplifyOptional(node.qualify); + SimplifyOptionalExpression(node.where_clause); + SimplifyOptionalExpression(node.having); + SimplifyOptionalExpression(node.qualify); SimplifyOptional(node.sample); - Simplify(*node.from_table); + Simplify(node.from_table); } void StatementSimplifier::Simplify(SetOperationNode &node) { @@ -154,13 +142,41 @@ void StatementSimplifier::Simplify(QueryNode &node) { SimplifyList(node.modifiers); } +void StatementSimplifier::SimplifyExpressionList(duckdb::unique_ptr &expr, vector> &expression_list) { + for(auto &child : expression_list) { + SimplifyChildExpression(expr, child); + } +} + +void StatementSimplifier::SimplifyExpressionList(vector> &expression_list, bool is_optional) { + SimplifyList(expression_list, is_optional); + for(auto &child : expression_list) { + SimplifyExpression(child); + } +} + +void StatementSimplifier::SimplifyChildExpression(duckdb::unique_ptr &expr, unique_ptr &child) { + if (!child) { + return; + } + SimplifyReplace(expr, child); + SimplifyExpression(child); +} + +void StatementSimplifier::SimplifyOptionalExpression(duckdb::unique_ptr &expr) { + if (!expr) { + return; + } + SimplifyOptional(expr); + SimplifyExpression(expr); +} + void StatementSimplifier::SimplifyExpression(duckdb::unique_ptr &expr) { if (!expr) { return; } auto expr_class = expr->GetExpressionClass(); switch (expr_class) { - case ExpressionClass::COLUMN_REF: case ExpressionClass::CONSTANT: return; default: @@ -171,37 +187,60 @@ void StatementSimplifier::SimplifyExpression(duckdb::unique_ptrCast(); - SimplifyListReplace(expr, conj.children); + SimplifyExpressionList(expr, conj.children); break; } case ExpressionClass::FUNCTION: { auto &func = expr->Cast(); - SimplifyListReplace(expr, func.children); - SimplifyListReplaceNull(func.children); + SimplifyExpressionList(expr, func.children); break; } case ExpressionClass::OPERATOR: { auto &op = expr->Cast(); - SimplifyListReplace(expr, op.children); + SimplifyExpressionList(expr, op.children); break; } case ExpressionClass::CASE: { auto &op = expr->Cast(); - SimplifyReplace(expr, op.else_expr); + 
SimplifyChildExpression(expr, op.else_expr); for (auto &case_check : op.case_checks) { - SimplifyReplace(expr, case_check.then_expr); - SimplifyReplace(expr, case_check.when_expr); + SimplifyChildExpression(expr, case_check.then_expr); + SimplifyChildExpression(expr, case_check.when_expr); } break; } case ExpressionClass::CAST: { auto &cast = expr->Cast(); - SimplifyReplace(expr, cast.child); + SimplifyChildExpression(expr, cast.child); break; } case ExpressionClass::COLLATE: { auto &collate = expr->Cast(); - SimplifyReplace(expr, collate.child); + SimplifyChildExpression(expr, collate.child); + break; + } + case ExpressionClass::SUBQUERY: { + auto &subq = expr->Cast(); + SimplifyChildExpression(expr, subq.child); + Simplify(*subq.subquery->node); + break; + } + case ExpressionClass::COMPARISON: { + auto &comp = expr->Cast(); + SimplifyChildExpression(expr, comp.left); + SimplifyChildExpression(expr, comp.right); + break; + } + case ExpressionClass::WINDOW: { + auto &window = expr->Cast(); + SimplifyExpressionList(expr, window.children); + SimplifyExpressionList(expr, window.partitions); + SimplifyList(window.orders); + SimplifyChildExpression(expr, window.filter_expr); + SimplifyChildExpression(expr, window.start_expr); + SimplifyChildExpression(expr, window.end_expr); + SimplifyChildExpression(expr, window.offset_expr); + SimplifyChildExpression(expr, window.default_expr); break; } default: @@ -212,7 +251,7 @@ void StatementSimplifier::SimplifyExpression(duckdb::unique_ptr()); break; default: break; @@ -267,9 +306,7 @@ void StatementSimplifier::Simplify(UpdateSetInfo &info) { void StatementSimplifier::Simplify(UpdateStatement &stmt) { Simplify(stmt.cte_map); - if (stmt.from_table) { - Simplify(*stmt.from_table); - } + SimplifyOptional(stmt.from_table); D_ASSERT(stmt.set_info); Simplify(*stmt.set_info); SimplifyList(stmt.returning_list); diff --git a/scripts/reduce_sql.py b/scripts/reduce_sql.py index de30dea09872..92a449b799b1 100644 --- a/scripts/reduce_sql.py +++ b/scripts/reduce_sql.py @@ -6,6 +6,10 @@ import multiprocessing import sqlite3 +# this script can be used as a library, but can also be directly called +# example usage: +# python3 scripts/reduce_sql.py --load load.sql --exec exec.sql + try: multiprocessing.set_start_method('fork') except RuntimeError: From f8f5387da9c5aa272a7d272a3974339ba5b6940e Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Thu, 11 Apr 2024 10:51:48 -0400 Subject: [PATCH 136/147] remove catalog/schema filtering from adbc getobjects --- src/common/adbc/adbc.cpp | 74 ++-- test/api/adbc/test_adbc.cpp | 155 ++++++--- tools/pythonpkg/tests/fast/adbc/test_adbc.py | 336 ++++++++++++++++++- 3 files changed, 445 insertions(+), 120 deletions(-) diff --git a/src/common/adbc/adbc.cpp b/src/common/adbc/adbc.cpp index eb388dc0a305..65a147b08d3e 100644 --- a/src/common/adbc/adbc.cpp +++ b/src/common/adbc/adbc.cpp @@ -994,14 +994,6 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth case ADBC_OBJECT_DEPTH_CATALOGS: // Return metadata on catalogs. 
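		// note (illustrative, not part of the patch itself): with the
		// filtered_schemata CTE removed below, the system and temp catalogs
		// (and the information_schema / pg_catalog schemas) are no longer
		// filtered out of GetObjects results; the updated tests in
		// test_adbc.cpp assert exactly that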
query = duckdb::StringUtil::Format(R"( - WITH filtered_schemata AS ( - SELECT - catalog_name, - schema_name, - FROM - information_schema.schemata - WHERE catalog_name NOT IN ('system', 'temp') AND schema_name NOT IN ('information_schema', 'pg_catalog') - ) SELECT catalog_name, []::STRUCT( @@ -1039,7 +1031,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth )[] )[] catalog_db_schemas FROM - filtered_schemata + information_schema.schemata WHERE catalog_name LIKE '%s' GROUP BY catalog_name )", @@ -1048,26 +1040,18 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth case ADBC_OBJECT_DEPTH_DB_SCHEMAS: // Return metadata on catalogs and schemas. query = duckdb::StringUtil::Format(R"( - WITH filtered_schemata AS ( + WITH db_schemas AS ( SELECT catalog_name, schema_name, - FROM - information_schema.schemata - WHERE catalog_name NOT IN ('system', 'temp') AND schema_name NOT IN ('information_schema', 'pg_catalog') - ), - db_schemas AS ( - SELECT - * - FROM - filtered_schemata + FROM information_schema.schemata WHERE schema_name LIKE '%s' ) SELECT catalog_name, LIST({ - db_schema_name: dbs.schema_name, + db_schema_name: schema_name, db_schema_tables: []::STRUCT( table_name VARCHAR, table_type VARCHAR, @@ -1099,11 +1083,11 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth constraint_column_usage STRUCT(fk_catalog VARCHAR, fk_db_schema VARCHAR, fk_table VARCHAR, fk_column_name VARCHAR)[] )[] )[], - }) FILTER (dbs.schema_name IS NOT null) AS catalog_db_schemas + }) FILTER (dbs.schema_name is not null) catalog_db_schemas FROM - filtered_schemata + information_schema.schemata LEFT JOIN db_schemas dbs - USING (catalog_name) + USING (catalog_name, schema_name) WHERE catalog_name LIKE '%s' GROUP BY catalog_name )", @@ -1112,15 +1096,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth case ADBC_OBJECT_DEPTH_TABLES: // Return metadata on catalogs, schemas, and tables. query = duckdb::StringUtil::Format(R"( - WITH filtered_schemata AS ( - SELECT - catalog_name, - schema_name, - FROM - information_schema.schemata - WHERE catalog_name NOT IN ('system', 'temp') AND schema_name NOT IN ('information_schema', 'pg_catalog') - ), - tables AS ( + WITH tables AS ( SELECT table_catalog catalog_name, table_schema schema_name, @@ -1164,8 +1140,8 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth catalog_name, schema_name, db_schema_tables, - FROM filtered_schemata fs - LEFT JOIN tables t + FROM information_schema.schemata + LEFT JOIN tables USING (catalog_name, schema_name) WHERE schema_name LIKE '%s' ) @@ -1173,13 +1149,13 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth SELECT catalog_name, LIST({ - db_schema_name: dbs.schema_name, + db_schema_name: schema_name, db_schema_tables: db_schema_tables, - }) FILTER (dbs.schema_name is not null) AS catalog_db_schemas + }) FILTER (dbs.schema_name is not null) catalog_db_schemas FROM - filtered_schemata + information_schema.schemata LEFT JOIN db_schemas dbs - USING (catalog_name) + USING (catalog_name, schema_name) WHERE catalog_name LIKE '%s' GROUP BY catalog_name )", @@ -1188,15 +1164,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth case ADBC_OBJECT_DEPTH_COLUMNS: // Return metadata on catalogs, schemas, tables, and columns. 
query = duckdb::StringUtil::Format(R"( - WITH filtered_schemata AS ( - SELECT - catalog_name, - schema_name, - FROM - information_schema.schemata - WHERE catalog_name NOT IN ('system', 'temp') AND schema_name NOT IN ('information_schema', 'pg_catalog') - ), - columns AS ( + WITH columns AS ( SELECT table_catalog, table_schema, @@ -1265,8 +1233,8 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth catalog_name, schema_name, db_schema_tables, - FROM filtered_schemata fs - LEFT JOIN tables t + FROM information_schema.schemata + LEFT JOIN tables USING (catalog_name, schema_name) WHERE schema_name LIKE '%s' ) @@ -1274,13 +1242,13 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth SELECT catalog_name, LIST({ - db_schema_name: dbs.schema_name, + db_schema_name: schema_name, db_schema_tables: db_schema_tables, - }) FILTER (dbs.schema_name is not null) AS catalog_db_schemas + }) FILTER (dbs.schema_name is not null) catalog_db_schemas FROM - filtered_schemata + information_schema.schemata LEFT JOIN db_schemas dbs - USING (catalog_name) + USING (catalog_name, schema_name) WHERE catalog_name LIKE '%s' GROUP BY catalog_name )", diff --git a/test/api/adbc/test_adbc.cpp b/test/api/adbc/test_adbc.cpp index fd840bced89f..1aeac5e272f8 100644 --- a/test/api/adbc/test_adbc.cpp +++ b/test/api/adbc/test_adbc.cpp @@ -1064,18 +1064,22 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_CATALOGS, nullptr, nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - auto res = db.Query("Select * from result"); + auto res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); - REQUIRE(res->GetValue(0, 0).ToString() == "test_catalog_depth"); + REQUIRE(res->RowCount() == 3); + REQUIRE(res->GetValue(0, 0).ToString() == "system"); + REQUIRE(res->GetValue(0, 1).ToString() == "temp"); + REQUIRE(res->GetValue(0, 2).ToString() == "test_catalog_depth"); REQUIRE(res->GetValue(1, 0).ToString() == "[]"); + REQUIRE(res->GetValue(1, 1).ToString() == "[]"); + REQUIRE(res->GetValue(1, 2).ToString() == "[]"); db.Query("Drop table result;"); // Test Filters AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_CATALOGS, "bla", nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 0); db.Query("Drop table result;"); @@ -1095,25 +1099,39 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_DB_SCHEMAS, nullptr, nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - auto res = db.Query("Select * from result"); + auto res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); + REQUIRE(res->RowCount() == 3); REQUIRE(res->GetValue(0, 0).ToString() == "ADBC_OBJECT_DEPTH_DB_SCHEMAS"); + REQUIRE(res->GetValue(0, 1).ToString() == "system"); + REQUIRE(res->GetValue(0, 2).ToString() == "temp"); string expected = R"([ + { + 'db_schema_name': information_schema, + 'db_schema_tables': [] + }, { 'db_schema_name': main, 'db_schema_tables': [] + }, + { + 'db_schema_name': 
pg_catalog, + 'db_schema_tables': [] } ])"; REQUIRE(StringUtil::Replace(res->GetValue(1, 0).ToString(), " ", "") == StringUtil::Replace(StringUtil::Replace(StringUtil::Replace(expected, "\n", ""), "\t", ""), " ", "")); + REQUIRE(StringUtil::Replace(res->GetValue(1, 1).ToString(), " ", "") == + StringUtil::Replace(StringUtil::Replace(StringUtil::Replace(expected, "\n", ""), "\t", ""), " ", "")); + REQUIRE(StringUtil::Replace(res->GetValue(1, 2).ToString(), " ", "") == + StringUtil::Replace(StringUtil::Replace(StringUtil::Replace(expected, "\n", ""), "\t", ""), " ", "")); db.Query("Drop table result;"); // Test Filters AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_DB_SCHEMAS, "bla", nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 0); db.Query("Drop table result;"); @@ -1121,10 +1139,12 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_DB_SCHEMAS, nullptr, "bla", nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); + REQUIRE(res->RowCount() == 3); REQUIRE(res->GetValue(1, 0).ToString() == "NULL"); + REQUIRE(res->GetValue(1, 1).ToString() == "NULL"); + REQUIRE(res->GetValue(1, 2).ToString() == "NULL"); db.Query("Drop table result;"); } // 3. Test ADBC_OBJECT_DEPTH_TABLES @@ -1141,11 +1161,17 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_TABLES, nullptr, nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - auto res = db.Query("Select * from result"); + auto res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); - REQUIRE(res->GetValue(0, 0).ToString() == "test_table_depth"); + REQUIRE(res->RowCount() == 3); + REQUIRE(res->GetValue(0, 0).ToString() == "system"); + REQUIRE(res->GetValue(0, 1).ToString() == "temp"); + REQUIRE(res->GetValue(0, 2).ToString() == "test_table_depth"); string expected = R"([ + { + 'db_schema_name': information_schema, + 'db_schema_tables': NULL + }, { 'db_schema_name': main, 'db_schema_tables': [ @@ -1156,9 +1182,13 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { 'table_constraints': [] } ] + }, + { + 'db_schema_name': pg_catalog, + 'db_schema_tables': NULL } ])"; - REQUIRE(StringUtil::Replace(res->GetValue(1, 0).ToString(), " ", "") == + REQUIRE(StringUtil::Replace(res->GetValue(1, 2).ToString(), " ", "") == StringUtil::Replace(StringUtil::Replace(StringUtil::Replace(expected, "\n", ""), "\t", ""), " ", "")); db.Query("Drop table result;"); @@ -1166,7 +1196,7 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_TABLES, "bla", nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 0); db.Query("Drop table result;"); @@ -1174,19 +1204,21 @@ 
TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_TABLES, nullptr, "bla", nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); + REQUIRE(res->RowCount() == 3); REQUIRE(res->GetValue(1, 0).ToString() == "NULL"); + REQUIRE(res->GetValue(1, 1).ToString() == "NULL"); + REQUIRE(res->GetValue(1, 2).ToString() == "NULL"); db.Query("Drop table result;"); AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_TABLES, nullptr, nullptr, "bla", nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); - REQUIRE(res->GetValue(1, 0).ToString() == "[{'db_schema_name': main, 'db_schema_tables': NULL}]"); + REQUIRE(res->RowCount() == 3); + REQUIRE(res->GetValue(1, 2).ToString() == "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); db.Query("Drop table result;"); } // 4.Test ADBC_OBJECT_DEPTH_COLUMNS @@ -1203,11 +1235,17 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - auto res = db.Query("Select * from result"); + auto res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); - REQUIRE(res->GetValue(0, 0).ToString() == "test_column_depth"); + REQUIRE(res->RowCount() == 3); + REQUIRE(res->GetValue(0, 0).ToString() == "system"); + REQUIRE(res->GetValue(0, 1).ToString() == "temp"); + REQUIRE(res->GetValue(0, 2).ToString() == "test_column_depth"); string expected = R"([ + { + 'db_schema_name': information_schema, + 'db_schema_tables': NULL + }, { 'db_schema_name': main, 'db_schema_tables': [ @@ -1240,9 +1278,13 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { 'table_constraints': NULL } ] + }, + { + 'db_schema_name': pg_catalog, + 'db_schema_tables': NULL } ])"; - REQUIRE(StringUtil::Replace(res->GetValue(1, 0).ToString(), " ", "") == + REQUIRE(StringUtil::Replace(res->GetValue(1, 2).ToString(), " ", "") == StringUtil::Replace(StringUtil::Replace(StringUtil::Replace(expected, "\n", ""), "\t", ""), " ", "")); db.Query("Drop table result;"); @@ -1250,7 +1292,7 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, "bla", nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 0); db.Query("Drop table result;"); @@ -1258,30 +1300,32 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, "bla", nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = 
db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); + REQUIRE(res->RowCount() == 3); REQUIRE(res->GetValue(1, 0).ToString() == "NULL"); + REQUIRE(res->GetValue(1, 1).ToString() == "NULL"); + REQUIRE(res->GetValue(1, 2).ToString() == "NULL"); db.Query("Drop table result;"); AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, nullptr, "bla", nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); - REQUIRE(res->GetValue(1, 0).ToString() == "[{'db_schema_name': main, 'db_schema_tables': NULL}]"); + REQUIRE(res->RowCount() == 3); + REQUIRE(res->GetValue(1, 2).ToString() == "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); db.Query("Drop table result;"); AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, nullptr, nullptr, nullptr, "bla", &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); - REQUIRE(res->GetValue(1, 0).ToString() == - "[{'db_schema_name': main, 'db_schema_tables': [{'table_name': my_table, 'table_type': BASE TABLE, " - "'table_columns': NULL, 'table_constraints': NULL}]}]"); + REQUIRE(res->RowCount() == 3); + REQUIRE(res->GetValue(1, 2).ToString() == + "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': [{'table_name': my_table, 'table_type': BASE TABLE, " + "'table_columns': NULL, 'table_constraints': NULL}]}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); db.Query("Drop table result;"); } // 5.Test ADBC_OBJECT_DEPTH_ALL @@ -1298,11 +1342,16 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_ALL, nullptr, nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - auto res = db.Query("Select * from result"); - REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); - REQUIRE(res->GetValue(0, 0).ToString() == "test_all_depth"); + auto res = db.Query("Select * from result order by catalog_name asc"); + REQUIRE(res->RowCount() == 3); + REQUIRE(res->GetValue(0, 0).ToString() == "system"); + REQUIRE(res->GetValue(0, 1).ToString() == "temp"); + REQUIRE(res->GetValue(0, 2).ToString() == "test_all_depth"); string expected = R"([ + { + 'db_schema_name': information_schema, + 'db_schema_tables': NULL + }, { 'db_schema_name': main, 'db_schema_tables': [ @@ -1335,9 +1384,13 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { 'table_constraints': NULL } ] + }, + { + 'db_schema_name': pg_catalog, + 'db_schema_tables': NULL } ])"; - REQUIRE(StringUtil::Replace(res->GetValue(1, 0).ToString(), " ", "") == + REQUIRE(StringUtil::Replace(res->GetValue(1, 2).ToString(), " ", "") == StringUtil::Replace(StringUtil::Replace(StringUtil::Replace(expected, "\n", ""), "\t", ""), " ", "")); db.Query("Drop table result;"); @@ -1345,7 +1398,7 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, 
ADBC_OBJECT_DEPTH_COLUMNS, "bla", nullptr, nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 0); db.Query("Drop table result;"); @@ -1353,30 +1406,32 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, "bla", nullptr, nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); + REQUIRE(res->RowCount() == 3); REQUIRE(res->GetValue(1, 0).ToString() == "NULL"); + REQUIRE(res->GetValue(1, 1).ToString() == "NULL"); + REQUIRE(res->GetValue(1, 2).ToString() == "NULL"); db.Query("Drop table result;"); AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, nullptr, "bla", nullptr, nullptr, &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); - REQUIRE(res->GetValue(1, 0).ToString() == "[{'db_schema_name': main, 'db_schema_tables': NULL}]"); + REQUIRE(res->RowCount() == 3); + REQUIRE(res->GetValue(1, 2).ToString() == "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); db.Query("Drop table result;"); AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, nullptr, nullptr, nullptr, "bla", &arrow_stream, &adbc_error); db.CreateTable("result", arrow_stream); - res = db.Query("Select * from result"); + res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); - REQUIRE(res->RowCount() == 1); - REQUIRE(res->GetValue(1, 0).ToString() == - "[{'db_schema_name': main, 'db_schema_tables': [{'table_name': my_table, 'table_type': BASE TABLE, " - "'table_columns': NULL, 'table_constraints': NULL}]}]"); + REQUIRE(res->RowCount() == 3); + REQUIRE(res->GetValue(1, 2).ToString() == + "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': [{'table_name': my_table, 'table_type': BASE TABLE, " + "'table_columns': NULL, 'table_constraints': NULL}]}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); db.Query("Drop table result;"); } // Now lets test some errors diff --git a/tools/pythonpkg/tests/fast/adbc/test_adbc.py b/tools/pythonpkg/tests/fast/adbc/test_adbc.py index e0e0ee3830b4..708e00a94d36 100644 --- a/tools/pythonpkg/tests/fast/adbc/test_adbc.py +++ b/tools/pythonpkg/tests/fast/adbc/test_adbc.py @@ -47,10 +47,48 @@ def test_connection_get_objects(duck_conn): with duck_conn.cursor() as cursor: cursor.execute("CREATE TABLE getobjects (ints BIGINT PRIMARY KEY)") depth_all = duck_conn.adbc_get_objects(depth="all").read_all() - assert depth_all.to_pylist() == [ + assert sorted_get_objects(depth_all.to_pylist()) == sorted_get_objects([ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 
'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, { 'catalog_name': 'memory', 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, { 'db_schema_name': 'main', 'db_schema_tables': [ @@ -97,15 +135,61 @@ def test_connection_get_objects(duck_conn): } ], }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, ], } - ] + ]) depth_tables = duck_conn.adbc_get_objects(depth="tables").read_all() - assert depth_tables.to_pylist() == [ + assert sorted_get_objects(depth_tables.to_pylist()) == sorted_get_objects([ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, { 'catalog_name': 'memory', 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, { 'db_schema_name': 'main', 'db_schema_tables': [ @@ -119,28 +203,78 @@ def test_connection_get_objects(duck_conn): }, ], } - ] + ]) depth_db_schemas = duck_conn.adbc_get_objects(depth="db_schemas").read_all() - assert depth_db_schemas.to_pylist() == [ + assert sorted_get_objects(depth_db_schemas.to_pylist()) == sorted_get_objects([ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, { 'catalog_name': 'memory', 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, { 'db_schema_name': 'main', 'db_schema_tables': [], }, ], } - ] + ]) depth_catalogs = duck_conn.adbc_get_objects(depth="catalogs").read_all() - assert depth_catalogs.to_pylist() == [ + assert sorted_get_objects(depth_catalogs.to_pylist()) == sorted_get_objects([ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [], + }, { 'catalog_name': 'memory', 'catalog_db_schemas': [], - } - ] + }, + ]) # All result schemas should be the same assert depth_all.schema == depth_tables.schema @@ -153,10 +287,48 @@ def test_connection_get_objects_filters(duck_conn): cursor.execute("CREATE TABLE getobjects (ints BIGINT PRIMARY KEY)") no_filter = duck_conn.adbc_get_objects(depth="all").read_all() - assert no_filter.to_pylist() == [ + assert 
sorted_get_objects(no_filter.to_pylist()) == sorted_get_objects([ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, { 'catalog_name': 'memory', 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, { 'db_schema_name': 'main', 'db_schema_tables': [ @@ -203,15 +375,61 @@ def test_connection_get_objects_filters(duck_conn): } ], }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, ], } - ] + ]) column_filter = duck_conn.adbc_get_objects(depth="all", column_name_filter="notexist").read_all() - assert column_filter.to_pylist() == [ + assert sorted_get_objects(column_filter.to_pylist()) == sorted_get_objects([ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, { 'catalog_name': 'memory', 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, { 'db_schema_name': 'main', 'db_schema_tables': [ @@ -238,28 +456,78 @@ def test_connection_get_objects_filters(duck_conn): }, ], } - ] + ]) table_name_filter = duck_conn.adbc_get_objects(depth="all", table_name_filter="notexist").read_all() - assert table_name_filter.to_pylist() == [ + assert sorted_get_objects(table_name_filter.to_pylist()) == sorted_get_objects([ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, { 'catalog_name': 'memory', 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, { 'db_schema_name': 'main', 'db_schema_tables': None, }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, ], } - ] + ]) db_schema_filter = duck_conn.adbc_get_objects(depth="all", db_schema_filter="notexist").read_all() - assert db_schema_filter.to_pylist() == [ + assert sorted_get_objects(db_schema_filter.to_pylist()) == sorted_get_objects([ + { + 'catalog_name': 'system', + 'catalog_db_schemas': None, + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': None, + }, { 'catalog_name': 'memory', 
'catalog_db_schemas': None, } - ] + ]) catalog_filter = duck_conn.adbc_get_objects(depth="all", catalog_filter="notexist").read_all() assert catalog_filter.to_pylist() == [] @@ -450,3 +718,37 @@ def test_read(duck_conn): datetime.datetime(2006, 2, 15, 4, 46, 27), ], } + +def sorted_get_objects(catalogs): + res = [] + for catalog in sorted(catalogs, key=lambda cat: cat['catalog_name']): + new_catalog = { + "catalog_name": catalog['catalog_name'], + "catalog_db_schemas": [], + } + + for db_schema in sorted(catalog['catalog_db_schemas'] or [], key=lambda sch: sch['db_schema_name']): + new_db_schema = { + "db_schema_name": db_schema['db_schema_name'], + "db_schema_tables": [], + } + + for table in sorted(db_schema['db_schema_tables'] or [], key=lambda tab: tab['table_name']): + new_table = { + "table_name": table['table_name'], + "table_type": table['table_type'], + "table_columns": [], + "table_constraints": [], + } + + for column in sorted(table['table_columns'] or [], key=lambda col: col['ordinal_position']): + new_table["table_columns"].append(column) + + for constraint in sorted(table['table_constraints'] or [], key=lambda con: con['constraint_name']): + new_table["table_constraints"].append(constraint) + + new_db_schema["db_schema_tables"].append(new_table) + new_catalog["catalog_db_schemas"].append(new_db_schema) + res.append(new_catalog) + + return res \ No newline at end of file From 3ff7cb2a63f1f9d2bb2fd427beefab07674b29af Mon Sep 17 00:00:00 2001 From: Joel Lubinitsky Date: Thu, 11 Apr 2024 11:07:32 -0400 Subject: [PATCH 137/147] run formatter --- test/api/adbc/test_adbc.cpp | 24 +- tools/pythonpkg/tests/fast/adbc/test_adbc.py | 943 ++++++++++--------- 2 files changed, 497 insertions(+), 470 deletions(-) diff --git a/test/api/adbc/test_adbc.cpp b/test/api/adbc/test_adbc.cpp index 1aeac5e272f8..d612a43d7fc6 100644 --- a/test/api/adbc/test_adbc.cpp +++ b/test/api/adbc/test_adbc.cpp @@ -1218,7 +1218,9 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 3); - REQUIRE(res->GetValue(1, 2).ToString() == "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); + REQUIRE(res->GetValue(1, 2).ToString() == + "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, " + "'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); db.Query("Drop table result;"); } // 4.Test ADBC_OBJECT_DEPTH_COLUMNS @@ -1314,7 +1316,9 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 3); - REQUIRE(res->GetValue(1, 2).ToString() == "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); + REQUIRE(res->GetValue(1, 2).ToString() == + "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, " + "'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); db.Query("Drop table result;"); AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, nullptr, nullptr, nullptr, @@ -1324,8 +1328,10 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { 
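	// since the system and temp catalogs are no longer filtered out, every GetObjects
	// result now carries three catalogs (system, temp, and the attached database);
	// the queries order by catalog_name so the row indices asserted below are deterministic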
REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 3); REQUIRE(res->GetValue(1, 2).ToString() == - "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': [{'table_name': my_table, 'table_type': BASE TABLE, " - "'table_columns': NULL, 'table_constraints': NULL}]}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); + "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, " + "'db_schema_tables': [{'table_name': my_table, 'table_type': BASE TABLE, " + "'table_columns': NULL, 'table_constraints': NULL}]}, {'db_schema_name': pg_catalog, " + "'db_schema_tables': NULL}]"); db.Query("Drop table result;"); } // 5.Test ADBC_OBJECT_DEPTH_ALL @@ -1420,7 +1426,9 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { res = db.Query("Select * from result order by catalog_name asc"); REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 3); - REQUIRE(res->GetValue(1, 2).ToString() == "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); + REQUIRE(res->GetValue(1, 2).ToString() == + "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, " + "'db_schema_tables': NULL}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); db.Query("Drop table result;"); AdbcConnectionGetObjects(&db.adbc_connection, ADBC_OBJECT_DEPTH_COLUMNS, nullptr, nullptr, nullptr, nullptr, @@ -1430,8 +1438,10 @@ TEST_CASE("Test AdbcConnectionGetObjects", "[adbc]") { REQUIRE(res->ColumnCount() == 2); REQUIRE(res->RowCount() == 3); REQUIRE(res->GetValue(1, 2).ToString() == - "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, 'db_schema_tables': [{'table_name': my_table, 'table_type': BASE TABLE, " - "'table_columns': NULL, 'table_constraints': NULL}]}, {'db_schema_name': pg_catalog, 'db_schema_tables': NULL}]"); + "[{'db_schema_name': information_schema, 'db_schema_tables': NULL}, {'db_schema_name': main, " + "'db_schema_tables': [{'table_name': my_table, 'table_type': BASE TABLE, " + "'table_columns': NULL, 'table_constraints': NULL}]}, {'db_schema_name': pg_catalog, " + "'db_schema_tables': NULL}]"); db.Query("Drop table result;"); } // Now lets test some errors diff --git a/tools/pythonpkg/tests/fast/adbc/test_adbc.py b/tools/pythonpkg/tests/fast/adbc/test_adbc.py index 708e00a94d36..9b05345f0bb3 100644 --- a/tools/pythonpkg/tests/fast/adbc/test_adbc.py +++ b/tools/pythonpkg/tests/fast/adbc/test_adbc.py @@ -47,234 +47,242 @@ def test_connection_get_objects(duck_conn): with duck_conn.cursor() as cursor: cursor.execute("CREATE TABLE getobjects (ints BIGINT PRIMARY KEY)") depth_all = duck_conn.adbc_get_objects(depth="all").read_all() - assert sorted_get_objects(depth_all.to_pylist()) == sorted_get_objects([ - { - 'catalog_name': 'system', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'temp', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'memory', - 
'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': [ - { - 'table_name': 'getobjects', - 'table_type': 'BASE TABLE', - 'table_columns': [ - { - 'column_name': 'ints', - 'ordinal_position': 1, - 'remarks': '', - 'xdbc_char_octet_length': None, - 'xdbc_column_def': None, - 'xdbc_column_size': None, - 'xdbc_data_type': None, - 'xdbc_datetime_sub': None, - 'xdbc_decimal_digits': None, - 'xdbc_is_autoincrement': None, - 'xdbc_is_generatedcolumn': None, - 'xdbc_is_nullable': None, - 'xdbc_nullable': None, - 'xdbc_num_prec_radix': None, - 'xdbc_scope_catalog': None, - 'xdbc_scope_schema': None, - 'xdbc_scope_table': None, - 'xdbc_sql_data_type': None, - 'xdbc_type_name': None, - }, - ], - 'table_constraints': [ - { - 'constraint_column_names': [], - 'constraint_column_usage': [], - 'constraint_name': 'getobjects_ints_pkey', - 'constraint_type': 'PRIMARY KEY', - }, - { - 'constraint_column_names': [], - 'constraint_column_usage': [], - 'constraint_name': 'getobjects_ints_not_null', - 'constraint_type': 'CHECK', - }, - ], - } - ], - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - } - ]) + assert sorted_get_objects(depth_all.to_pylist()) == sorted_get_objects( + [ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'memory', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': [ + { + 'table_name': 'getobjects', + 'table_type': 'BASE TABLE', + 'table_columns': [ + { + 'column_name': 'ints', + 'ordinal_position': 1, + 'remarks': '', + 'xdbc_char_octet_length': None, + 'xdbc_column_def': None, + 'xdbc_column_size': None, + 'xdbc_data_type': None, + 'xdbc_datetime_sub': None, + 'xdbc_decimal_digits': None, + 'xdbc_is_autoincrement': None, + 'xdbc_is_generatedcolumn': None, + 'xdbc_is_nullable': None, + 'xdbc_nullable': None, + 'xdbc_num_prec_radix': None, + 'xdbc_scope_catalog': None, + 'xdbc_scope_schema': None, + 'xdbc_scope_table': None, + 'xdbc_sql_data_type': None, + 'xdbc_type_name': None, + }, + ], + 'table_constraints': [ + { + 'constraint_column_names': [], + 'constraint_column_usage': [], + 'constraint_name': 'getobjects_ints_pkey', + 'constraint_type': 'PRIMARY KEY', + }, + { + 'constraint_column_names': [], + 'constraint_column_usage': [], + 'constraint_name': 'getobjects_ints_not_null', + 'constraint_type': 'CHECK', + }, + ], + } + ], + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + ] + ) depth_tables = duck_conn.adbc_get_objects(depth="tables").read_all() - assert sorted_get_objects(depth_tables.to_pylist()) == sorted_get_objects([ - { - 'catalog_name': 'system', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 
'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'temp', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'memory', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': [ - { - 'table_name': 'getobjects', - 'table_type': 'BASE TABLE', - 'table_columns': [], - 'table_constraints': [], - } - ], - }, - ], - } - ]) + assert sorted_get_objects(depth_tables.to_pylist()) == sorted_get_objects( + [ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'memory', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': [ + { + 'table_name': 'getobjects', + 'table_type': 'BASE TABLE', + 'table_columns': [], + 'table_constraints': [], + } + ], + }, + ], + }, + ] + ) depth_db_schemas = duck_conn.adbc_get_objects(depth="db_schemas").read_all() - assert sorted_get_objects(depth_db_schemas.to_pylist()) == sorted_get_objects([ - { - 'catalog_name': 'system', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'temp', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'memory', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': [], - }, - ], - } - ]) + assert sorted_get_objects(depth_db_schemas.to_pylist()) == sorted_get_objects( + [ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'memory', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 
'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': [], + }, + ], + }, + ] + ) depth_catalogs = duck_conn.adbc_get_objects(depth="catalogs").read_all() - assert sorted_get_objects(depth_catalogs.to_pylist()) == sorted_get_objects([ - { - 'catalog_name': 'system', - 'catalog_db_schemas': [], - }, - { - 'catalog_name': 'temp', - 'catalog_db_schemas': [], - }, - { - 'catalog_name': 'memory', - 'catalog_db_schemas': [], - }, - ]) + assert sorted_get_objects(depth_catalogs.to_pylist()) == sorted_get_objects( + [ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [], + }, + { + 'catalog_name': 'memory', + 'catalog_db_schemas': [], + }, + ] + ) # All result schemas should be the same assert depth_all.schema == depth_tables.schema @@ -287,247 +295,255 @@ def test_connection_get_objects_filters(duck_conn): cursor.execute("CREATE TABLE getobjects (ints BIGINT PRIMARY KEY)") no_filter = duck_conn.adbc_get_objects(depth="all").read_all() - assert sorted_get_objects(no_filter.to_pylist()) == sorted_get_objects([ - { - 'catalog_name': 'system', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'temp', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'memory', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': [ - { - 'table_name': 'getobjects', - 'table_type': 'BASE TABLE', - 'table_columns': [ - { - 'column_name': 'ints', - 'ordinal_position': 1, - 'remarks': '', - 'xdbc_char_octet_length': None, - 'xdbc_column_def': None, - 'xdbc_column_size': None, - 'xdbc_data_type': None, - 'xdbc_datetime_sub': None, - 'xdbc_decimal_digits': None, - 'xdbc_is_autoincrement': None, - 'xdbc_is_generatedcolumn': None, - 'xdbc_is_nullable': None, - 'xdbc_nullable': None, - 'xdbc_num_prec_radix': None, - 'xdbc_scope_catalog': None, - 'xdbc_scope_schema': None, - 'xdbc_scope_table': None, - 'xdbc_sql_data_type': None, - 'xdbc_type_name': None, - }, - ], - 'table_constraints': [ - { - 'constraint_column_names': [], - 'constraint_column_usage': [], - 'constraint_name': 'getobjects_ints_pkey', - 'constraint_type': 'PRIMARY KEY', - }, - { - 'constraint_column_names': [], - 'constraint_column_usage': [], - 'constraint_name': 'getobjects_ints_not_null', - 'constraint_type': 'CHECK', - }, - ], - } - ], - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - } - ]) + assert sorted_get_objects(no_filter.to_pylist()) == sorted_get_objects( + [ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 
'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'memory', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': [ + { + 'table_name': 'getobjects', + 'table_type': 'BASE TABLE', + 'table_columns': [ + { + 'column_name': 'ints', + 'ordinal_position': 1, + 'remarks': '', + 'xdbc_char_octet_length': None, + 'xdbc_column_def': None, + 'xdbc_column_size': None, + 'xdbc_data_type': None, + 'xdbc_datetime_sub': None, + 'xdbc_decimal_digits': None, + 'xdbc_is_autoincrement': None, + 'xdbc_is_generatedcolumn': None, + 'xdbc_is_nullable': None, + 'xdbc_nullable': None, + 'xdbc_num_prec_radix': None, + 'xdbc_scope_catalog': None, + 'xdbc_scope_schema': None, + 'xdbc_scope_table': None, + 'xdbc_sql_data_type': None, + 'xdbc_type_name': None, + }, + ], + 'table_constraints': [ + { + 'constraint_column_names': [], + 'constraint_column_usage': [], + 'constraint_name': 'getobjects_ints_pkey', + 'constraint_type': 'PRIMARY KEY', + }, + { + 'constraint_column_names': [], + 'constraint_column_usage': [], + 'constraint_name': 'getobjects_ints_not_null', + 'constraint_type': 'CHECK', + }, + ], + } + ], + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + ] + ) column_filter = duck_conn.adbc_get_objects(depth="all", column_name_filter="notexist").read_all() - assert sorted_get_objects(column_filter.to_pylist()) == sorted_get_objects([ - { - 'catalog_name': 'system', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'temp', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'memory', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': [ - { - 'table_name': 'getobjects', - 'table_type': 'BASE TABLE', - 'table_columns': None, - 'table_constraints': [ - { - 'constraint_column_names': [], - 'constraint_column_usage': [], - 'constraint_name': 'getobjects_ints_pkey', - 'constraint_type': 'PRIMARY KEY', - }, - { - 'constraint_column_names': [], - 'constraint_column_usage': [], - 'constraint_name': 'getobjects_ints_not_null', - 'constraint_type': 'CHECK', - }, - ], - } - ], - }, - ], - } - ]) + assert sorted_get_objects(column_filter.to_pylist()) == sorted_get_objects( + [ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'memory', + 'catalog_db_schemas': [ + { 
+ 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': [ + { + 'table_name': 'getobjects', + 'table_type': 'BASE TABLE', + 'table_columns': None, + 'table_constraints': [ + { + 'constraint_column_names': [], + 'constraint_column_usage': [], + 'constraint_name': 'getobjects_ints_pkey', + 'constraint_type': 'PRIMARY KEY', + }, + { + 'constraint_column_names': [], + 'constraint_column_usage': [], + 'constraint_name': 'getobjects_ints_not_null', + 'constraint_type': 'CHECK', + }, + ], + } + ], + }, + ], + }, + ] + ) table_name_filter = duck_conn.adbc_get_objects(depth="all", table_name_filter="notexist").read_all() - assert sorted_get_objects(table_name_filter.to_pylist()) == sorted_get_objects([ - { - 'catalog_name': 'system', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'temp', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - }, - { - 'catalog_name': 'memory', - 'catalog_db_schemas': [ - { - 'db_schema_name': 'information_schema', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'main', - 'db_schema_tables': None, - }, - { - 'db_schema_name': 'pg_catalog', - 'db_schema_tables': None, - }, - ], - } - ]) + assert sorted_get_objects(table_name_filter.to_pylist()) == sorted_get_objects( + [ + { + 'catalog_name': 'system', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + { + 'catalog_name': 'memory', + 'catalog_db_schemas': [ + { + 'db_schema_name': 'information_schema', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'main', + 'db_schema_tables': None, + }, + { + 'db_schema_name': 'pg_catalog', + 'db_schema_tables': None, + }, + ], + }, + ] + ) db_schema_filter = duck_conn.adbc_get_objects(depth="all", db_schema_filter="notexist").read_all() - assert sorted_get_objects(db_schema_filter.to_pylist()) == sorted_get_objects([ - { - 'catalog_name': 'system', - 'catalog_db_schemas': None, - }, - { - 'catalog_name': 'temp', - 'catalog_db_schemas': None, - }, - { - 'catalog_name': 'memory', - 'catalog_db_schemas': None, - } - ]) + assert sorted_get_objects(db_schema_filter.to_pylist()) == sorted_get_objects( + [ + { + 'catalog_name': 'system', + 'catalog_db_schemas': None, + }, + { + 'catalog_name': 'temp', + 'catalog_db_schemas': None, + }, + { + 'catalog_name': 'memory', + 'catalog_db_schemas': None, + }, + ] + ) catalog_filter = duck_conn.adbc_get_objects(depth="all", catalog_filter="notexist").read_all() assert catalog_filter.to_pylist() == [] @@ -719,6 +735,7 @@ def test_read(duck_conn): ], } + def sorted_get_objects(catalogs): res = [] for 
catalog in sorted(catalogs, key=lambda cat: cat['catalog_name']): @@ -726,13 +743,13 @@ def sorted_get_objects(catalogs): "catalog_name": catalog['catalog_name'], "catalog_db_schemas": [], } - + for db_schema in sorted(catalog['catalog_db_schemas'] or [], key=lambda sch: sch['db_schema_name']): new_db_schema = { "db_schema_name": db_schema['db_schema_name'], "db_schema_tables": [], } - + for table in sorted(db_schema['db_schema_tables'] or [], key=lambda tab: tab['table_name']): new_table = { "table_name": table['table_name'], @@ -740,15 +757,15 @@ def sorted_get_objects(catalogs): "table_columns": [], "table_constraints": [], } - + for column in sorted(table['table_columns'] or [], key=lambda col: col['ordinal_position']): new_table["table_columns"].append(column) - + for constraint in sorted(table['table_constraints'] or [], key=lambda con: con['constraint_name']): new_table["table_constraints"].append(constraint) - + new_db_schema["db_schema_tables"].append(new_table) new_catalog["catalog_db_schemas"].append(new_db_schema) res.append(new_catalog) - return res \ No newline at end of file + return res From a2b532266b55c77e7ccd6f1851ae2e66c133eee1 Mon Sep 17 00:00:00 2001 From: Mark Raasveldt Date: Thu, 11 Apr 2024 18:00:55 +0200 Subject: [PATCH 138/147] Use references in subquery flattening code --- .../subquery/flatten_dependent_join.hpp | 4 +- .../binder/query_node/plan_subquery.cpp | 18 +++---- .../subquery/flatten_dependent_join.cpp | 53 +++++++++---------- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp b/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp index efc41cf648a5..991e084c42ab 100644 --- a/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp +++ b/src/include/duckdb/planner/subquery/flatten_dependent_join.hpp @@ -23,7 +23,7 @@ struct FlattenDependentJoins { //! Detects which Logical Operators have correlated expressions that they are dependent upon, filling the //! has_correlated_expressions map. - bool DetectCorrelatedExpressions(LogicalOperator *op, bool lateral = false, idx_t lateral_depth = 0); + bool DetectCorrelatedExpressions(LogicalOperator &op, bool lateral = false, idx_t lateral_depth = 0); //! Mark entire subtree of Logical Operators as correlated by adding them to the has_correlated_expressions map. 
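	//! Note: has_correlated_expressions is a reference_map_t, which effectively hashes on
	//! the address of the referenced operator, so lookups take the operator itself
	//! (find(op)) rather than a pointer to it (find(&op)).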
bool MarkSubtreeCorrelated(LogicalOperator &op); @@ -35,7 +35,7 @@ struct FlattenDependentJoins { ColumnBinding base_binding; idx_t delim_offset; idx_t data_offset; - unordered_map has_correlated_expressions; + reference_map_t has_correlated_expressions; column_binding_map_t correlated_map; column_binding_map_t replacement_map; const vector &correlated_columns; diff --git a/src/planner/binder/query_node/plan_subquery.cpp b/src/planner/binder/query_node/plan_subquery.cpp index 7370457a6031..29e8f36c3063 100644 --- a/src/planner/binder/query_node/plan_subquery.cpp +++ b/src/planner/binder/query_node/plan_subquery.cpp @@ -254,7 +254,7 @@ static unique_ptr PlanCorrelatedSubquery(Binder &binder, BoundSubque FlattenDependentJoins flatten(binder, correlated_columns, perform_delim); // first we check which logical operators have correlated expressions in the first place - flatten.DetectCorrelatedExpressions(plan.get()); + flatten.DetectCorrelatedExpressions(*plan); // now we push the dependent join down auto dependent_join = flatten.PushDownDependentJoin(std::move(plan)); @@ -279,7 +279,7 @@ static unique_ptr PlanCorrelatedSubquery(Binder &binder, BoundSubque delim_join->mark_index = mark_index; // RHS FlattenDependentJoins flatten(binder, correlated_columns, perform_delim, true); - flatten.DetectCorrelatedExpressions(plan.get()); + flatten.DetectCorrelatedExpressions(*plan); auto dependent_join = flatten.PushDownDependentJoin(std::move(plan)); // fetch the set of columns @@ -307,7 +307,7 @@ static unique_ptr PlanCorrelatedSubquery(Binder &binder, BoundSubque delim_join->mark_index = mark_index; // RHS FlattenDependentJoins flatten(binder, correlated_columns, true, true); - flatten.DetectCorrelatedExpressions(plan.get()); + flatten.DetectCorrelatedExpressions(*plan); auto dependent_join = flatten.PushDownDependentJoin(std::move(plan)); // fetch the columns @@ -411,7 +411,7 @@ void Binder::PlanSubqueries(unique_ptr &expr_ptr, unique_ptr Binder::PlanLateralJoin(unique_ptr left, unique_ptr right, - vector &correlated_columns, + vector &correlated, JoinType join_type, unique_ptr condition) { // scan the right operator for correlated columns // correlated LATERAL JOIN @@ -423,13 +423,13 @@ unique_ptr Binder::PlanLateralJoin(unique_ptr arbitrary_expressions); } - auto perform_delim = PerformDuplicateElimination(*this, correlated_columns); - auto delim_join = CreateDuplicateEliminatedJoin(correlated_columns, join_type, std::move(left), perform_delim); + auto perform_delim = PerformDuplicateElimination(*this, correlated); + auto delim_join = CreateDuplicateEliminatedJoin(correlated, join_type, std::move(left), perform_delim); - FlattenDependentJoins flatten(*this, correlated_columns, perform_delim); + FlattenDependentJoins flatten(*this, correlated, perform_delim); // first we check which logical operators have correlated expressions in the first place - flatten.DetectCorrelatedExpressions(right.get(), true); + flatten.DetectCorrelatedExpressions(*right, true); // now we push the dependent join down auto dependent_join = flatten.PushDownDependentJoin(std::move(right)); @@ -448,7 +448,7 @@ unique_ptr Binder::PlanLateralJoin(unique_ptr D_ASSERT(delim_join->conditions.empty()); delim_join->conditions = std::move(conditions); // then add the delim join conditions - CreateDelimJoinConditions(*delim_join, correlated_columns, plan_columns, flatten.delim_offset, perform_delim); + CreateDelimJoinConditions(*delim_join, correlated, plan_columns, flatten.delim_offset, perform_delim); 
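	// at this point the lateral join has been rewritten into (roughly):
	//   DELIM_JOIN [join_type] (conditions on the correlated columns)
	//     left
	//     dependent_join   (the flattened right side)
	// i.e. the correlated columns flow from the left side into a duplicate-eliminated
	// scan on the right, which is what the conditions constructed above encode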
delim_join->AddChild(std::move(dependent_join)); // check if there are any arbitrary expressions left diff --git a/src/planner/subquery/flatten_dependent_join.cpp b/src/planner/subquery/flatten_dependent_join.cpp index 4e5623cab48d..7e863703472c 100644 --- a/src/planner/subquery/flatten_dependent_join.cpp +++ b/src/planner/subquery/flatten_dependent_join.cpp @@ -27,21 +27,20 @@ FlattenDependentJoins::FlattenDependentJoins(Binder &binder, const vectortype == LogicalOperatorType::LOGICAL_DEPENDENT_JOIN) { + if (op.type == LogicalOperatorType::LOGICAL_DEPENDENT_JOIN) { is_lateral_join = true; } HasCorrelatedExpressions visitor(correlated_columns, lateral, lateral_depth); - visitor.VisitOperator(*op); + visitor.VisitOperator(op); bool has_correlation = visitor.has_correlated_expressions; int child_idx = 0; // now visit the children of this entry and check if they have correlated expressions - for (auto &child : op->children) { + for (auto &child : op.children) { auto new_lateral_depth = lateral_depth; if (is_lateral_join && child_idx == 1) { new_lateral_depth = lateral_depth + 1; @@ -49,7 +48,7 @@ bool FlattenDependentJoins::DetectCorrelatedExpressions(LogicalOperator *op, boo // we OR the property with its children such that has_correlation is true if either // (1) this node has a correlated expression or // (2) one of its children has a correlated expression - if (DetectCorrelatedExpressions(child.get(), lateral, new_lateral_depth)) { + if (DetectCorrelatedExpressions(*child, lateral, new_lateral_depth)) { has_correlation = true; } child_idx++; @@ -60,10 +59,10 @@ bool FlattenDependentJoins::DetectCorrelatedExpressions(LogicalOperator *op, boo // If we detect correlation in a materialized or recursive CTE, the entire right side of the operator // needs to be marked as correlated. Otherwise, function PushDownDependentJoinInternal does not do the // right thing. 
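	// For example, in a (sketched) query like
	//   SELECT (WITH cte AS MATERIALIZED (SELECT i.x AS y) SELECT y FROM cte) FROM tbl i;
	// the CTE scan on the right side contains no correlated expression itself, but it
	// still has to be re-evaluated for every row of tbl, so the whole subtree is marked.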
- if (op->type == LogicalOperatorType::LOGICAL_MATERIALIZED_CTE || - op->type == LogicalOperatorType::LOGICAL_RECURSIVE_CTE) { + if (op.type == LogicalOperatorType::LOGICAL_MATERIALIZED_CTE || + op.type == LogicalOperatorType::LOGICAL_RECURSIVE_CTE) { if (has_correlation) { - MarkSubtreeCorrelated(*op->children[1].get()); + MarkSubtreeCorrelated(*op.children[1].get()); } } return has_correlation; @@ -71,7 +70,7 @@ bool FlattenDependentJoins::DetectCorrelatedExpressions(LogicalOperator *op, boo bool FlattenDependentJoins::MarkSubtreeCorrelated(LogicalOperator &op) { // Do not mark base table scans as correlated - auto entry = has_correlated_expressions.find(&op); + auto entry = has_correlated_expressions.find(op); D_ASSERT(entry != has_correlated_expressions.end()); bool has_correlation = entry->second; for (auto &child : op.children) { @@ -79,10 +78,10 @@ bool FlattenDependentJoins::MarkSubtreeCorrelated(LogicalOperator &op) { } if (op.type != LogicalOperatorType::LOGICAL_GET || op.children.size() == 1) { if (op.type == LogicalOperatorType::LOGICAL_CTE_REF) { - has_correlated_expressions[&op] = true; + has_correlated_expressions[op] = true; return true; } else { - has_correlated_expressions[&op] = has_correlation; + has_correlated_expressions[op] = has_correlation; } } return has_correlation; @@ -99,17 +98,17 @@ unique_ptr FlattenDependentJoins::PushDownDependentJoin(unique_ return result; } -bool SubqueryDependentFilter(Expression *expr) { - if (expr->expression_class == ExpressionClass::BOUND_CONJUNCTION && - expr->GetExpressionType() == ExpressionType::CONJUNCTION_AND) { - auto &bound_conjuction = expr->Cast(); +bool SubqueryDependentFilter(Expression &expr) { + if (expr.expression_class == ExpressionClass::BOUND_CONJUNCTION && + expr.GetExpressionType() == ExpressionType::CONJUNCTION_AND) { + auto &bound_conjuction = expr.Cast(); for (auto &child : bound_conjuction.children) { - if (SubqueryDependentFilter(child.get())) { + if (SubqueryDependentFilter(*child)) { return true; } } } - if (expr->expression_class == ExpressionClass::BOUND_SUBQUERY) { + if (expr.expression_class == ExpressionClass::BOUND_SUBQUERY) { return true; } return false; @@ -119,7 +118,7 @@ unique_ptr FlattenDependentJoins::PushDownDependentJoinInternal bool &parent_propagate_null_values, idx_t lateral_depth) { // first check if the logical operator has correlated expressions - auto entry = has_correlated_expressions.find(plan.get()); + auto entry = has_correlated_expressions.find(*plan); D_ASSERT(entry != has_correlated_expressions.end()); if (!entry->second) { // we reached a node without correlated expressions @@ -151,7 +150,7 @@ unique_ptr FlattenDependentJoins::PushDownDependentJoinInternal // filter // first we flatten the dependent join in the child of the filter for (auto &expr : plan->expressions) { - any_join |= SubqueryDependentFilter(expr.get()); + any_join |= SubqueryDependentFilter(*expr); } plan->children[0] = PushDownDependentJoinInternal(std::move(plan->children[0]), parent_propagate_null_values, lateral_depth); @@ -288,8 +287,8 @@ unique_ptr FlattenDependentJoins::PushDownDependentJoinInternal case LogicalOperatorType::LOGICAL_CROSS_PRODUCT: { // cross product // push into both sides of the plan - bool left_has_correlation = has_correlated_expressions.find(plan->children[0].get())->second; - bool right_has_correlation = has_correlated_expressions.find(plan->children[1].get())->second; + bool left_has_correlation = has_correlated_expressions.find(*plan->children[0])->second; + bool 
+		bool right_has_correlation = has_correlated_expressions.find(*plan->children[1])->second;
 		if (!right_has_correlation) {
 			// only left has correlation: push into left
 			plan->children[0] = PushDownDependentJoinInternal(std::move(plan->children[0]),
@@ -350,8 +349,8 @@ unique_ptr<LogicalOperator> FlattenDependentJoins::PushDownDependentJoinInternal
 		auto &join = plan->Cast<LogicalJoin>();
 		D_ASSERT(plan->children.size() == 2);
 		// check the correlated expressions in the children of the join
-		bool left_has_correlation = has_correlated_expressions.find(plan->children[0].get())->second;
-		bool right_has_correlation = has_correlated_expressions.find(plan->children[1].get())->second;
+		bool left_has_correlation = has_correlated_expressions.find(*plan->children[0])->second;
+		bool right_has_correlation = has_correlated_expressions.find(*plan->children[1])->second;
 
 		if (join.join_type == JoinType::INNER) {
 			// inner join
@@ -433,12 +432,12 @@ unique_ptr<LogicalOperator> FlattenDependentJoins::PushDownDependentJoinInternal
 				auto &comparison_join = join.Cast<LogicalComparisonJoin>();
 				comparison_join.conditions.push_back(std::move(cond));
 			} else {
-				auto &any_join = join.Cast<LogicalAnyJoin>();
+				auto &logical_any_join = join.Cast<LogicalAnyJoin>();
 				auto comparison = make_uniq<BoundComparisonExpression>(ExpressionType::COMPARE_NOT_DISTINCT_FROM,
 				                                                       std::move(left), std::move(right));
 				auto conjunction = make_uniq<BoundConjunctionExpression>(
-				    ExpressionType::CONJUNCTION_AND, std::move(comparison), std::move(any_join.condition));
-				any_join.condition = std::move(conjunction);
+				    ExpressionType::CONJUNCTION_AND, std::move(comparison), std::move(logical_any_join.condition));
+				logical_any_join.condition = std::move(conjunction);
 			}
 		}
 		// then we replace any correlated expressions with the corresponding entry in the correlated_map
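This patch replaces pointer-based traversal with references throughout the flattening code: DetectCorrelatedExpressions now takes a LogicalOperator &, and the has_correlated_expressions map is queried with find(*plan) and find(op) instead of raw pointers. A standalone sketch (illustrative only — Node, RefHash and RefEq are invented stand-ins, not DuckDB's actual reference_map_t) of a map keyed by object identity that supports such reference-based lookups:

// identity_map_sketch.cpp — hashes and compares by address, not by value
#include <cassert>
#include <functional>
#include <unordered_map>

struct Node {
	int id;
};

struct RefHash {
	std::size_t operator()(const std::reference_wrapper<Node> &ref) const {
		return std::hash<const Node *>()(&ref.get()); // hash the address: identity semantics
	}
};
struct RefEq {
	bool operator()(const std::reference_wrapper<Node> &a, const std::reference_wrapper<Node> &b) const {
		return &a.get() == &b.get();
	}
};

int main() {
	std::unordered_map<std::reference_wrapper<Node>, bool, RefHash, RefEq> has_correlation;
	Node op {42};
	has_correlation[op] = true;                                 // insert keyed by the object itself
	assert(has_correlation.find(op) != has_correlation.end()); // mirrors find(*plan) in the patch
	return 0;
}

Keying by reference also documents the non-null invariant in the type system, which is what the deleted runtime D_ASSERT(op) used to check.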
From 3516894d197ef49604e5337da2d28144f355770f Mon Sep 17 00:00:00 2001
From: Mark Raasveldt
Date: Thu, 11 Apr 2024 20:06:21 +0200
Subject: [PATCH 139/147] For ExpressionDepthReducerRecursive - correctly recurse into nested subqueries

---
 .../expression_binder/lateral_binder.cpp      | 16 +++++------
 .../subquery/lateral/lateral_fuzzer_1463.test | 28 +++++++++++++++++++
 2 files changed, 36 insertions(+), 8 deletions(-)
 create mode 100644 test/sql/subquery/lateral/lateral_fuzzer_1463.test

diff --git a/src/planner/expression_binder/lateral_binder.cpp b/src/planner/expression_binder/lateral_binder.cpp
index 13e3ae59980a..e58d78afd0b7 100644
--- a/src/planner/expression_binder/lateral_binder.cpp
+++ b/src/planner/expression_binder/lateral_binder.cpp
@@ -81,11 +81,6 @@ static void ReduceColumnDepth(vector<CorrelatedColumnInfo> &columns,
 	}
 }
 
-static void ReduceExpressionSubquery(BoundSubqueryExpression &expr,
-                                     const vector<CorrelatedColumnInfo> &correlated_columns) {
-	ReduceColumnDepth(expr.binder->correlated_columns, correlated_columns);
-}
-
 class ExpressionDepthReducerRecursive : public BoundNodeVisitor {
 public:
 	explicit ExpressionDepthReducerRecursive(const vector<CorrelatedColumnInfo> &correlated)
@@ -111,6 +106,13 @@ class ExpressionDepthReducerRecursive : public BoundNodeVisitor {
 		BoundNodeVisitor::VisitBoundTableRef(ref);
 	}
 
+	static void ReduceExpressionSubquery(BoundSubqueryExpression &expr,
+	                                     const vector<CorrelatedColumnInfo> &correlated_columns) {
+		ReduceColumnDepth(expr.binder->correlated_columns, correlated_columns);
+		ExpressionDepthReducerRecursive recursive(correlated_columns);
+		recursive.VisitBoundQueryNode(*expr.subquery);
+	}
+
 private:
 	const vector<CorrelatedColumnInfo> &correlated_columns;
 };
@@ -127,9 +129,7 @@ class ExpressionDepthReducer : public LogicalOperatorVisitor {
 	}
 
 	unique_ptr<Expression> VisitReplace(BoundSubqueryExpression &expr, unique_ptr<Expression> *expr_ptr) override {
-		ReduceExpressionSubquery(expr, correlated_columns);
-		ExpressionDepthReducerRecursive recursive(correlated_columns);
-		recursive.VisitBoundQueryNode(*expr.subquery);
+		ExpressionDepthReducerRecursive::ReduceExpressionSubquery(expr, correlated_columns);
 		return nullptr;
 	}
 
diff --git a/test/sql/subquery/lateral/lateral_fuzzer_1463.test b/test/sql/subquery/lateral/lateral_fuzzer_1463.test
new file mode 100644
index 000000000000..003bfd424411
--- /dev/null
+++ b/test/sql/subquery/lateral/lateral_fuzzer_1463.test
@@ -0,0 +1,28 @@
+# name: test/sql/subquery/lateral/lateral_fuzzer_1463.test
+# description: Test case for fuzzer issue 1463: Expression with depth > 1 detected in non-lateral join
+# group: [lateral]
+
+query II
+SELECT *
+FROM
+  (SELECT 42 AS c1) AS ref,
+  (SELECT a + b + 1
+   FROM
+     (SELECT 1) t1(a),
+     (SELECT (SELECT (SELECT ref.c1 + 1)) + 1) t2(b)
+  )
+;
+----
+42	46
+
+# postgres compatible variant
+query I
+SELECT NULL
+FROM
+  (SELECT 42 AS c1) AS ref,
+  LATERAL (SELECT NULL
+           FROM
+             (SELECT NULL) AS r2,
+             (SELECT (SELECT (SELECT ref.c1))) AS r3) AS r4;
+----
+NULL
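The fix above makes ReduceExpressionSubquery a member of the recursive visitor, so reducing the depth of correlated columns also walks into subqueries nested inside the subquery — exactly the SELECT (SELECT (SELECT ref.c1)) shape exercised by the new test. A toy model (plain C++; Subquery and ReduceDepth are invented names, not DuckDB's binder classes) of why the recursion matters:

// depth_reduction_sketch.cpp — every nesting level must be adjusted, not just the outermost
#include <iostream>
#include <memory>
#include <vector>

struct Subquery {
	int correlated_depth;                            // stand-in for a correlated column's depth
	std::vector<std::unique_ptr<Subquery>> children; // nested subqueries
};

static void ReduceDepth(Subquery &sq) {
	if (sq.correlated_depth > 1) {
		sq.correlated_depth--; // the flattened lateral join removes one level of correlation
	}
	for (auto &child : sq.children) {
		ReduceDepth(*child); // the fix: recurse into nested subqueries as well
	}
}

int main() {
	Subquery outer {3, {}};
	outer.children.push_back(std::make_unique<Subquery>(Subquery {3, {}}));
	ReduceDepth(outer);
	// without the recursive call the child would stay at depth 3; prints "2 2"
	std::cout << outer.correlated_depth << " " << outer.children[0]->correlated_depth << "\n";
	return 0;
}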
From 08ac807ee9d39a1179a195060f4cf06023d5d704 Mon Sep 17 00:00:00 2001
From: Mark Raasveldt
Date: Thu, 11 Apr 2024 20:08:22 +0200
Subject: [PATCH 140/147] Fix reduce SQL test

---
 test/sqlsmith/sql_reduce.test | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/test/sqlsmith/sql_reduce.test b/test/sqlsmith/sql_reduce.test
index 73ee30559b1c..7b956e8db065 100644
--- a/test/sqlsmith/sql_reduce.test
+++ b/test/sqlsmith/sql_reduce.test
@@ -7,18 +7,33 @@
 require sqlsmith
 
 query I
 SELECT * FROM reduce_sql_statement('SELECT a, b FROM tbl') ORDER BY 1
 ----
+SELECT NULL, b FROM tbl
+SELECT NULL, b FROM tbl
 SELECT a FROM tbl
+SELECT a, NULL FROM tbl
+SELECT a, NULL FROM tbl
 SELECT a, b
 SELECT b FROM tbl
 
 query I
 SELECT * FROM reduce_sql_statement('SELECT a, b FROM tbl WHERE a AND b') ORDER BY 1
 ----
+SELECT NULL, b FROM tbl WHERE (a AND b)
+SELECT NULL, b FROM tbl WHERE (a AND b)
 SELECT a FROM tbl WHERE (a AND b)
+SELECT a, NULL FROM tbl WHERE (a AND b)
+SELECT a, NULL FROM tbl WHERE (a AND b)
 SELECT a, b FROM tbl
+SELECT a, b FROM tbl WHERE (NULL AND b)
+SELECT a, b FROM tbl WHERE (NULL AND b)
+SELECT a, b FROM tbl WHERE (a AND NULL)
+SELECT a, b FROM tbl WHERE (a AND NULL)
 SELECT a, b FROM tbl WHERE NULL
+SELECT a, b FROM tbl WHERE NULL
+SELECT a, b FROM tbl WHERE a
 SELECT a, b FROM tbl WHERE a
 SELECT a, b FROM tbl WHERE b
+SELECT a, b FROM tbl WHERE b
 SELECT a, b WHERE (a AND b)
 SELECT b FROM tbl WHERE (a AND b)
 
 INSERT INTO tbl (VALUES (1))
 INSERT INTO tbl (VALUES (2))
 INSERT INTO tbl SELECT *
 INSERT INTO tbl SELECT NULL FROM (VALUES (1, 2))
+INSERT INTO tbl SELECT NULL FROM (VALUES (1, 2)) AS valueslist
 
 query I
 SELECT * FROM reduce_sql_statement('UPDATE tbl SET i=3, j=4 WHERE z=5') ORDER BY 1
 ----
 UPDATE tbl SET i = 3 WHERE (z = 5)
 UPDATE tbl SET i = 3, j = 4
+UPDATE tbl SET i = 3, j = 4 WHERE (NULL = 5)
+UPDATE tbl SET i = 3, j = 4 WHERE 5
 UPDATE tbl SET i = 3, j = 4 WHERE NULL
+UPDATE tbl SET i = 3, j = 4 WHERE z
 UPDATE tbl SET j = 4 WHERE (z = 5)
 
 query I
 SELECT * FROM reduce_sql_statement('DELETE FROM a WHERE i >= 2000 AND i < 5000;') ORDER BY 1
 ----
 DELETE FROM a
+DELETE FROM a WHERE ((NULL >= 2000) AND (i < 5000))
+DELETE FROM a WHERE ((i >= 2000) AND (NULL < 5000))
+DELETE FROM a WHERE ((i >= 2000) AND 5000)
+DELETE FROM a WHERE ((i >= 2000) AND NULL)
+DELETE FROM a WHERE ((i >= 2000) AND i)
+DELETE FROM a WHERE (2000 AND (i < 5000))
+DELETE FROM a WHERE (NULL AND (i < 5000))
 DELETE FROM a WHERE (i < 5000)
 DELETE FROM a WHERE (i >= 2000)
+DELETE FROM a WHERE (i AND (i < 5000))
 DELETE FROM a WHERE NULL

From 1795abd305f3c538ab93d70f7e4c0299c340baf6 Mon Sep 17 00:00:00 2001
From: Mark Raasveldt
Date: Thu, 11 Apr 2024 20:18:19 +0200
Subject: [PATCH 141/147] Format fix

---
 .../sqlsmith/include/statement_simplifier.hpp |  3 +-
 extension/sqlsmith/statement_simplifier.cpp   | 13 +++++---
 scripts/fuzzer_helper.py                      | 33 +++++++++++++++----
 scripts/reduce_sql.py                         | 32 ++++++++++++++----
 scripts/run_fuzzer.py                         | 29 ++++++++++++----
 .../binder/query_node/plan_subquery.cpp       |  4 +--
 .../expression_binder/lateral_binder.cpp      |  2 +-
 7 files changed, 87 insertions(+), 29 deletions(-)

diff --git a/extension/sqlsmith/include/statement_simplifier.hpp b/extension/sqlsmith/include/statement_simplifier.hpp
index 2bdf2715cd18..9e6fa160ede4 100644
--- a/extension/sqlsmith/include/statement_simplifier.hpp
+++ b/extension/sqlsmith/include/statement_simplifier.hpp
@@ -65,7 +65,8 @@ class StatementSimplifier {
 	void SimplifyExpression(duckdb::unique_ptr<ParsedExpression> &expr);
 	void SimplifyOptionalExpression(duckdb::unique_ptr<ParsedExpression> &expr);
 	void SimplifyChildExpression(duckdb::unique_ptr<ParsedExpression> &expr, unique_ptr<ParsedExpression> &child);
-	void SimplifyExpressionList(duckdb::unique_ptr<ParsedExpression> &expr, vector<unique_ptr<ParsedExpression>> &expression_list);
+	void SimplifyExpressionList(duckdb::unique_ptr<ParsedExpression> &expr,
+	                            vector<unique_ptr<ParsedExpression>> &expression_list);
 	void SimplifyExpressionList(vector<unique_ptr<ParsedExpression>> &expression_list, bool is_optional = true);
 
 	void Simplify(CommonTableExpressionMap &cte_map);

diff --git a/extension/sqlsmith/statement_simplifier.cpp b/extension/sqlsmith/statement_simplifier.cpp
index edfd3437ee70..998d32df9fcc 100644
--- a/extension/sqlsmith/statement_simplifier.cpp
+++ b/extension/sqlsmith/statement_simplifier.cpp
@@ -142,20 +142,23 @@ void StatementSimplifier::Simplify(QueryNode &node) {
 	SimplifyList(node.modifiers);
 }
 
-void StatementSimplifier::SimplifyExpressionList(duckdb::unique_ptr<ParsedExpression> &expr, vector<unique_ptr<ParsedExpression>> &expression_list) {
-	for(auto &child : expression_list) {
+void StatementSimplifier::SimplifyExpressionList(duckdb::unique_ptr<ParsedExpression> &expr,
+                                                 vector<unique_ptr<ParsedExpression>> &expression_list) {
+	for (auto &child : expression_list) {
 		SimplifyChildExpression(expr, child);
 	}
 }
 
-void StatementSimplifier::SimplifyExpressionList(vector<unique_ptr<ParsedExpression>> &expression_list, bool is_optional) {
+void StatementSimplifier::SimplifyExpressionList(vector<unique_ptr<ParsedExpression>> &expression_list,
+                                                 bool is_optional) {
 	SimplifyList(expression_list, is_optional);
-	for(auto &child : expression_list) {
+	for (auto &child : expression_list) {
 		SimplifyExpression(child);
 	}
 }
 
-void StatementSimplifier::SimplifyChildExpression(duckdb::unique_ptr<ParsedExpression> &expr, unique_ptr<ParsedExpression> &child) {
+void StatementSimplifier::SimplifyChildExpression(duckdb::unique_ptr<ParsedExpression> &expr,
+                                                  unique_ptr<ParsedExpression> &child) {
 	if (!child) {
 		return;
 	}

diff --git a/scripts/fuzzer_helper.py b/scripts/fuzzer_helper.py
index d457b3172d49..dd82e0622490 100644
--- a/scripts/fuzzer_helper.py
+++ b/scripts/fuzzer_helper.py
@@ -29,6 +29,7 @@ footer = '''
 ```'''
 
+
 # github stuff
 def issue_url():
     return 'https://api.github.com/repos/%s/%s/issues' % (REPO_OWNER, REPO_NAME)
@@ -47,20 +48,22 @@ def get_token():
         print("Incorrect length for FUZZEROFDUCKSKEY")
         exit(1)
     return token
+
+
 def create_session():
     # Create an authenticated session to create the issue
     session = requests.Session()
     session.headers.update({'Authorization': 'token %s' % (get_token(),)})
     return session
 
+
 def make_github_issue(title, body):
     if len(title) > 240:
         # avoid title is too long error (maximum is 256 characters)
         title = title[:240] + '...'
session = create_session() url = issue_url() - issue = {'title': title, - 'body': body} + issue = {'title': title, 'body': body} r = session.post(url, json.dumps(issue)) if r.status_code == 201: print('Successfully created Issue "%s"' % title) @@ -69,9 +72,10 @@ def make_github_issue(title, body): print('Response:', r.content.decode('utf8')) raise Exception("Failed to create issue") + def get_github_issues(page): session = create_session() - url = issue_url()+'?per_page=100&page='+str(page) + url = issue_url() + '?per_page=100&page=' + str(page) r = session.get(url) if r.status_code != 200: print('Failed to get list of issues') @@ -79,6 +83,7 @@ def get_github_issues(page): raise Exception("Failed to get list of issues") return json.loads(r.content.decode('utf8')) + def close_github_issue(number): session = create_session() url = issue_url() + '/' + str(number) @@ -91,6 +96,7 @@ def close_github_issue(number): print('Response:', r.content.decode('utf8')) raise Exception("Failed to close issue") + def label_github_issue(number, label): session = create_session() url = issue_url() + '/' + str(number) @@ -103,22 +109,26 @@ def label_github_issue(number, label): print('Response:', r.content.decode('utf8')) raise Exception("Failed to label issue") + def extract_issue(body, nr): try: splits = body.split(middle) sql = splits[0].split(header)[1] - error = splits[1][:-len(footer)] + error = splits[1][: -len(footer)] return (sql, error) except: print(f"Failed to extract SQL/error message from issue {nr}") print(body) return None + def run_shell_command_batch(shell, cmd): command = [shell, '--batch', '-init', '/dev/null'] try: - res = subprocess.run(command, input=bytearray(cmd, 'utf8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=300) + res = subprocess.run( + command, input=bytearray(cmd, 'utf8'), stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=300 + ) except subprocess.TimeoutExpired: print(f"TIMEOUT... 
{cmd}") return ("", "", 0, True) @@ -126,6 +136,7 @@ def run_shell_command_batch(shell, cmd): stderr = res.stderr.decode('utf8').strip() return (stdout, stderr, res.returncode, False) + def test_reproducibility(shell, issue, current_errors, perform_check): extract = extract_issue(issue['body'], issue['number']) labels = issue['labels'] @@ -152,9 +163,10 @@ def test_reproducibility(shell, issue, current_errors, perform_check): current_errors[error] = issue return True + def extract_github_issues(shell, perform_check): current_errors = dict() - for p in range(1,10): + for p in range(1, 10): issues = get_github_issues(p) for issue in issues: # check if the github issue is still reproducible @@ -164,16 +176,23 @@ def extract_github_issues(shell, perform_check): close_github_issue(int(issue['number'])) return current_errors + def file_issue(cmd, error_msg, fuzzer, seed, hash): # issue is new, file it print("Filing new issue to Github") title = error_msg - body = fuzzer_desc.replace("${FUZZER}", fuzzer).replace("${FULL_HASH}", hash).replace("${SHORT_HASH}", hash[:5]).replace("${SEED}", str(seed)) + body = ( + fuzzer_desc.replace("${FUZZER}", fuzzer) + .replace("${FULL_HASH}", hash) + .replace("${SHORT_HASH}", hash[:5]) + .replace("${SEED}", str(seed)) + ) body += header + cmd + middle + error_msg + footer print(title, body) make_github_issue(title, body) + def is_internal_error(error): if 'differs from original result' in error: return True diff --git a/scripts/reduce_sql.py b/scripts/reduce_sql.py index 92a449b799b1..9d96bc41fca7 100644 --- a/scripts/reduce_sql.py +++ b/scripts/reduce_sql.py @@ -18,6 +18,7 @@ SELECT * FROM reduce_sql_statement('${QUERY}'); ''' + def sanitize_error(err): err = re.sub(r'Error: near line \d+: ', '', err) err = err.replace(os.getcwd() + '/', '') @@ -27,6 +28,7 @@ def sanitize_error(err): err = 'AddressSanitizer error ' + match return err + def run_shell_command(shell, cmd): command = [shell, '-csv', '--batch', '-init', '/dev/null'] @@ -35,6 +37,7 @@ def run_shell_command(shell, cmd): stderr = res.stderr.decode('utf8').strip() return (stdout, stderr, res.returncode) + def get_reduced_sql(shell, sql_query): reduce_query = get_reduced_query.replace('${QUERY}', sql_query.replace("'", "''")) (stdout, stderr, returncode) = run_shell_command(shell, reduce_query) @@ -47,6 +50,7 @@ def get_reduced_sql(shell, sql_query): reduce_candidates.append(line.strip('"').replace('""', '"')) return reduce_candidates[1:] + def reduce(sql_query, data_load, shell, error_msg, max_time_seconds=300): start = time.time() while True: @@ -73,18 +77,22 @@ def reduce(sql_query, data_load, shell, error_msg, max_time_seconds=300): break return sql_query + def is_ddl_query(query): query = query.lower() if 'create' in query or 'insert' in query or 'update' in query or 'delete' in query: return True return False + def initial_cleanup(query_log): query_log = query_log.replace('SELECT * FROM pragma_version()\n', '') return query_log + def run_queries_until_crash_mp(queries, result_file): import duckdb + con = duckdb.connect() sqlite_con = sqlite3.connect(result_file) sqlite_con.execute('CREATE TABLE queries(id INT, text VARCHAR)') @@ -109,7 +117,7 @@ def run_queries_until_crash_mp(queries, result_file): keep_query = True sqlite_con.execute('UPDATE result SET text=?', (exception_error,)) if not keep_query: - sqlite_con.execute('DELETE FROM queries WHERE id=?', (id, )) + sqlite_con.execute('DELETE FROM queries WHERE id=?', (id,)) if is_internal_error: # found internal error: no need to try further queries 
break @@ -120,6 +128,7 @@ def run_queries_until_crash_mp(queries, result_file): sqlite_con.commit() sqlite_con.close() + def run_queries_until_crash(queries): sqlite_file = 'cleaned_queries.db' if os.path.isfile(sqlite_file): @@ -147,8 +156,10 @@ def cleanup_irrelevant_queries(query_log): queries = [x for x in query_log.split(';\n') if len(x) > 0] return run_queries_until_crash(queries) + # def reduce_internal(start, sql_query, data_load, queries_final, shell, error_msg, max_time_seconds=300): + def reduce_query_log_query(start, shell, queries, query_index, max_time_seconds): new_query_list = queries[:] sql_query = queries[query_index] @@ -180,6 +191,7 @@ def reduce_query_log_query(start, shell, queries, query_index, max_time_seconds) break return sql_query + def reduce_query_log(queries, shell, max_time_seconds=300): start = time.time() current_index = 0 @@ -190,7 +202,7 @@ def reduce_query_log(queries, shell, max_time_seconds=300): if current_time - start > max_time_seconds: break # remove the query at "current_index" - new_queries = queries[:current_index] + queries[current_index + 1:] + new_queries = queries[:current_index] + queries[current_index + 1 :] # try to run the queries and check if we still get the same error (new_queries_x, current_error) = run_queries_until_crash(new_queries) if current_error is None: @@ -212,12 +224,19 @@ def reduce_query_log(queries, shell, max_time_seconds=300): if __name__ == "__main__": import argparse + parser = argparse.ArgumentParser(description='Reduce a problematic SQL query') - parser.add_argument('--shell', dest='shell', action='store', help='Path to the shell executable', default='build/debug/duckdb') + parser.add_argument( + '--shell', dest='shell', action='store', help='Path to the shell executable', default='build/debug/duckdb' + ) parser.add_argument('--load', dest='load', action='store', help='Path to the data load script', required=True) parser.add_argument('--exec', dest='exec', action='store', help='Path to the executable script', required=True) - parser.add_argument('--inplace', dest='inplace', action='store_true', help='If true, overrides the exec script with the final query') - parser.add_argument('--max-time', dest='max_time', action='store', help='Maximum time in seconds to run the reducer', default=300) + parser.add_argument( + '--inplace', dest='inplace', action='store_true', help='If true, overrides the exec script with the final query' + ) + parser.add_argument( + '--max-time', dest='max_time', action='store', help='Maximum time in seconds to run the reducer', default=300 + ) args = parser.parse_args() print("Starting reduce process") @@ -234,7 +253,6 @@ def reduce_query_log(queries, shell, max_time_seconds=300): print(expected_error) print("===================================================") - final_query = reduce(sql_query, data_load, shell, expected_error, args.max_time) print("Found final reduced query") print("===================================================") @@ -302,4 +320,4 @@ def reduce_query_log(queries, shell, max_time_seconds=300): # limit 88 # ''' # -# print(reduce(sql_query, data_load, shell, error_msg)) \ No newline at end of file +# print(reduce(sql_query, data_load, shell, error_msg)) diff --git a/scripts/run_fuzzer.py b/scripts/run_fuzzer.py index 57f7a677b3fd..c7d096f11ed9 100644 --- a/scripts/run_fuzzer.py +++ b/scripts/run_fuzzer.py @@ -50,6 +50,7 @@ git_hash = os.getenv('DUCKDB_HASH') + def create_db_script(db): if db == 'alltypes': return 'create table all_types as select * exclude(small_enum, 
medium_enum, large_enum) from test_all_types();'
     elif db == 'tpch':
         return 'call dbgen(sf=0.1);'
     elif db == 'tpcds':
         return 'call dsdgen(sf=0.1);'
     else:
         raise Exception("Unknown database creation script")
 
+
 def run_fuzzer_script(fuzzer):
     if fuzzer == 'sqlsmith':
         return "call sqlsmith(max_queries=${MAX_QUERIES}, seed=${SEED}, verbose_output=1, log='${LAST_LOG_FILE}', complete_log='${COMPLETE_LOG_FILE}');"
     elif fuzzer == 'duckfuzz':
         return "call fuzzyduck(max_queries=${MAX_QUERIES}, seed=${SEED}, verbose_output=1, log='${LAST_LOG_FILE}', complete_log='${COMPLETE_LOG_FILE}');"
     else:
         raise Exception("Unknown fuzzer type")
 
+
 def get_fuzzer_name(fuzzer):
     if fuzzer == 'sqlsmith':
         return 'SQLSmith'
     elif fuzzer == 'duckfuzz':
         return 'DuckFuzz'
     else:
         return 'Unknown'
 
+
 def run_shell_command(cmd):
     command = [shell, '--batch', '-init', '/dev/null']
@@ -96,13 +100,21 @@
 last_query_log_file = 'sqlsmith.log'
 complete_log_file = 'sqlsmith.complete.log'
 
-print(f'''==========================================
+print(
+    f'''==========================================
                 RUNNING {fuzzer} on {db}
-==========================================''')
+=========================================='''
+)
 
 load_script = create_db_script(db)
 fuzzer_name = get_fuzzer_name(fuzzer)
-fuzzer = run_fuzzer_script(fuzzer).replace('${MAX_QUERIES}', str(max_queries)).replace('${LAST_LOG_FILE}', last_query_log_file).replace('${COMPLETE_LOG_FILE}', complete_log_file).replace('${SEED}', str(seed))
+fuzzer = (
+    run_fuzzer_script(fuzzer)
+    .replace('${MAX_QUERIES}', str(max_queries))
+    .replace('${LAST_LOG_FILE}', last_query_log_file)
+    .replace('${COMPLETE_LOG_FILE}', complete_log_file)
+    .replace('${SEED}', str(seed))
+)
 
 print(load_script)
 print(fuzzer)
@@ -113,9 +125,11 @@
 (stdout, stderr, returncode) = run_shell_command(cmd)
 
-print(f'''==========================================
+print(
+    f'''==========================================
                 FINISHED RUNNING
-==========================================''')
+=========================================='''
+)
 print("============== STDOUT ================")
 print(stdout)
 print("============== STDERR =================")
@@ -160,7 +174,10 @@
 # check if this is a duplicate issue
 if error_msg in current_errors:
     print("Skip filing duplicate issue")
-    print("Issue already exists: https://github.com/duckdb/duckdb-fuzzer/issues/" + str(current_errors[error_msg]['number']))
+    print(
+        "Issue already exists: https://github.com/duckdb/duckdb-fuzzer/issues/"
+        + str(current_errors[error_msg]['number'])
+    )
     exit(0)
 
 print(last_query)

diff --git a/src/planner/binder/query_node/plan_subquery.cpp b/src/planner/binder/query_node/plan_subquery.cpp
index 29e8f36c3063..3f3aaa92c9aa 100644
--- a/src/planner/binder/query_node/plan_subquery.cpp
+++ b/src/planner/binder/query_node/plan_subquery.cpp
@@ -411,8 +411,8 @@ void Binder::PlanSubqueries(unique_ptr<Expression> &expr_ptr, unique_ptr<Logical
 unique_ptr<LogicalOperator> Binder::PlanLateralJoin(unique_ptr<LogicalOperator> left, unique_ptr<LogicalOperator> right,
-                                                    vector<CorrelatedColumnInfo> &correlated,
-                                                    JoinType join_type, unique_ptr<Expression> condition) {
+                                                    vector<CorrelatedColumnInfo> &correlated, JoinType join_type,
+                                                    unique_ptr<Expression> condition) {
 	// scan the right operator for correlated columns
 	// correlated LATERAL JOIN
 	vector<JoinCondition> conditions;

diff --git a/src/planner/expression_binder/lateral_binder.cpp b/src/planner/expression_binder/lateral_binder.cpp
index e58d78afd0b7..21ceb4e50c3f 100644
--- a/src/planner/expression_binder/lateral_binder.cpp
+++ b/src/planner/expression_binder/lateral_binder.cpp
@@ -107,7 +107,7 @@ class ExpressionDepthReducerRecursive : public BoundNodeVisitor {
 	}
 
 	static void ReduceExpressionSubquery(BoundSubqueryExpression &expr,
-	                                 const vector<CorrelatedColumnInfo> &correlated_columns) {
+	                                     const vector<CorrelatedColumnInfo> &correlated_columns) {
 		ReduceColumnDepth(expr.binder->correlated_columns, correlated_columns);
 		ExpressionDepthReducerRecursive recursive(correlated_columns);
 		recursive.VisitBoundQueryNode(*expr.subquery);

From 0d77ca409657e7a9e25ee8da812169e973b3470a Mon Sep 17 00:00:00 2001
From: Mark Raasveldt
Date: Thu, 11 Apr 2024 22:58:23 +0200
Subject: [PATCH 142/147] Fix shell tests

---
 tools/shell/tests/test_shell_basics.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/tools/shell/tests/test_shell_basics.py b/tools/shell/tests/test_shell_basics.py
index 41d605017079..7fecae9cd1b0 100644
--- a/tools/shell/tests/test_shell_basics.py
+++ b/tools/shell/tests/test_shell_basics.py
@@ -326,14 +326,6 @@ def test_show_basic(shell):
     result = test.run()
     result.check_stdout("rowseparator")
 
-def test_limit_error(shell):
-    test = (
-        ShellTest(shell)
-        .statement(".limit length 42")
-    )
-    result = test.run()
-    result.check_stderr("sqlite3_limit")
-
 def test_timeout(shell):
     test = (
         ShellTest(shell)
@@ -1039,8 +1031,6 @@ def test_nullbyte_error_rendering(shell):
     result.check_stderr('INT32')
 
 @pytest.mark.parametrize("stmt", [
-    "select decimal_mul(NULL, NULL);",
-    "select decimal_mul(NULL, i) FROM range(3) t(i);",
     "select sha3(NULL);"
 ])
 def test_sqlite_udf_null(shell, stmt):

From ec6186246b94cc7bfb387c0017f0f7c322a4cbdb Mon Sep 17 00:00:00 2001
From: Carlo Piovesan
Date: Thu, 11 Apr 2024 12:10:24 +0200
Subject: [PATCH 143/147] LogicalDelete: Move from dynamic_cast to Cast

---
 src/planner/operator/logical_delete.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/planner/operator/logical_delete.cpp b/src/planner/operator/logical_delete.cpp
index a028a1ea6f36..950f2eaa2ebb 100644
--- a/src/planner/operator/logical_delete.cpp
+++ b/src/planner/operator/logical_delete.cpp
@@ -14,7 +14,7 @@ LogicalDelete::LogicalDelete(TableCatalogEntry &table, idx_t table_index)
 LogicalDelete::LogicalDelete(ClientContext &context, const unique_ptr<CreateInfo> &table_info)
     : LogicalOperator(LogicalOperatorType::LOGICAL_DELETE),
       table(Catalog::GetEntry<TableCatalogEntry>(context, table_info->catalog, table_info->schema,
-                                                 dynamic_cast<CreateTableInfo &>(*table_info).table)) {
+                                                 table_info->Cast<CreateTableInfo>().table)) {
 }
 
 idx_t LogicalDelete::EstimateCardinality(ClientContext &context) {

From aac4b382daea9a93baa7af23bf0b4b019c7f4c62 Mon Sep 17 00:00:00 2001
From: Carlo Piovesan
Date: Thu, 11 Apr 2024 12:11:01 +0200
Subject: [PATCH 144/147] LogicalUpdate: Move from dynamic_cast to Cast

---
 src/planner/operator/logical_update.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/planner/operator/logical_update.cpp b/src/planner/operator/logical_update.cpp
index e66dd36d1a9c..edcfd5d891be 100644
--- a/src/planner/operator/logical_update.cpp
+++ b/src/planner/operator/logical_update.cpp
@@ -12,7 +12,7 @@ LogicalUpdate::LogicalUpdate(TableCatalogEntry &table)
 LogicalUpdate::LogicalUpdate(ClientContext &context, const unique_ptr<CreateInfo> &table_info)
     : LogicalOperator(LogicalOperatorType::LOGICAL_UPDATE),
       table(Catalog::GetEntry<TableCatalogEntry>(context, table_info->catalog, table_info->schema,
-                                                 dynamic_cast<CreateTableInfo &>(*table_info).table)) {
+                                                 table_info->Cast<CreateTableInfo>().table)) {
 }
 
 idx_t LogicalUpdate::EstimateCardinality(ClientContext &context) {
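Patches 143 and 144 swap RTTI-based dynamic_cast for the Cast<TARGET>() helper that the catalog and planner classes expose. The following minimal sketch (a simplified stand-in, not the actual implementation in duckdb/common/helper.hpp; assert replaces D_ASSERT) shows the shape of the idiom these commits rely on:

// cast_idiom_sketch.cpp — a debug-checked reinterpret_cast member template
#include <cassert>

struct CreateInfo {
	virtual ~CreateInfo() = default;

	template <class TARGET>
	TARGET &Cast() {
		assert(dynamic_cast<TARGET *>(this));     // type check in debug builds only
		return reinterpret_cast<TARGET &>(*this); // no RTTI cost in release builds
	}
};

struct CreateTableInfo : CreateInfo {
	const char *table = "my_table";
};

int main() {
	CreateTableInfo info;
	CreateInfo &base = info;
	// mirrors table_info->Cast<CreateTableInfo>().table in the patches above
	return base.Cast<CreateTableInfo>().table != nullptr ? 0 : 1;
}

Because the check only runs under the debug assertion macro, release builds pay nothing for the cast, while debug builds still catch the same type mismatches the old dynamic_cast would have.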
From 55991db6925f17ea7623b8e5b6a9ef540f8945e4 Mon Sep 17 00:00:00 2001
From: Carlo Piovesan
Date: Thu, 11 Apr 2024 12:11:32 +0200
Subject: [PATCH 145/147] LogicalInsert: Move from dynamic_cast to Cast

---
 src/planner/operator/logical_insert.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/planner/operator/logical_insert.cpp b/src/planner/operator/logical_insert.cpp
index 3846ed009742..518661616058 100644
--- a/src/planner/operator/logical_insert.cpp
+++ b/src/planner/operator/logical_insert.cpp
@@ -14,7 +14,7 @@ LogicalInsert::LogicalInsert(TableCatalogEntry &table, idx_t table_index)
 LogicalInsert::LogicalInsert(ClientContext &context, const unique_ptr<CreateInfo> table_info)
     : LogicalOperator(LogicalOperatorType::LOGICAL_INSERT),
       table(Catalog::GetEntry<TableCatalogEntry>(context, table_info->catalog, table_info->schema,
-                                                 dynamic_cast<CreateTableInfo &>(*table_info).table)) {
+                                                 table_info->Cast<CreateTableInfo>().table)) {
 }
 
 idx_t LogicalInsert::EstimateCardinality(ClientContext &context) {

From 4c78ff0e4ecf83c70b0d73891fa02b755615b380 Mon Sep 17 00:00:00 2001
From: Carlo Piovesan
Date: Thu, 11 Apr 2024 16:53:55 +0200
Subject: [PATCH 146/147] Add stricter runtime check on dynamic_cast being a no-op

---
 src/include/duckdb/common/helper.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/include/duckdb/common/helper.hpp b/src/include/duckdb/common/helper.hpp
index d4c07cc47091..c989983ba013 100644
--- a/src/include/duckdb/common/helper.hpp
+++ b/src/include/duckdb/common/helper.hpp
@@ -217,7 +217,8 @@ template <class T>
 bool RefersToSameObject(const T &a, const T &b) {
 	return &a == &b;
 }
 
 template <class TARGET, class SRC>
 void DynamicCastCheck(const SRC *source) {
 #ifndef __APPLE__
-	D_ASSERT(dynamic_cast<const TARGET *>(source));
+	// Actual check is on the fact that dynamic_cast and reinterpret_cast are equivalent
+	D_ASSERT(reinterpret_cast<const TARGET *>(source) == dynamic_cast<const TARGET *>(source));
 #endif
 }
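The stricter D_ASSERT added above is not equivalent to the old one: besides rejecting wrong types, it also rejects casts where dynamic_cast succeeds but adjusts the pointer, which a reinterpret_cast-based Cast() would silently get wrong. A small standalone example (hypothetical types A, B, C, not from the DuckDB tree) of the case the new check catches:

// mi_pointer_adjustment.cpp — with multiple inheritance the two casts diverge
#include <iostream>

struct A {
	virtual ~A() = default;
	int a = 1;
};
struct B {
	virtual ~B() = default;
	int b = 2;
};
struct C : A, B {};

int main() {
	C c;
	B *as_b = &c; // the implicit upcast shifts the pointer past the A subobject
	// dynamic_cast adjusts back to the start of C, reinterpret_cast does not: prints 0
	std::cout << (reinterpret_cast<C *>(as_b) == dynamic_cast<C *>(as_b)) << "\n";
	return 0;
}

For DuckDB's single-inheritance class hierarchies the two casts agree and the assertion is a no-op; the check exists to fail loudly in debug builds if that assumption is ever violated.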
From 39200d51d683ceb12414cc061129cb0ef706b9bf Mon Sep 17 00:00:00 2001
From: Carlo Piovesan
Date: Thu, 11 Apr 2024 14:44:23 +0200
Subject: [PATCH 147/147] More dynamic_casts mirroring non-const case

---
 src/include/duckdb/catalog/catalog_entry.hpp          |  2 +-
 src/include/duckdb/common/allocator.hpp               |  2 +-
 src/include/duckdb/common/extra_type_info.hpp         |  2 +-
 .../common/types/column/partitioned_column_data.hpp   |  2 +-
 .../common/types/row/partitioned_tuple_data.hpp       |  2 +-
 src/include/duckdb/common/types/vector_buffer.hpp     |  2 +-
 .../duckdb/execution/expression_executor_state.hpp    |  2 +-
 .../duckdb/execution/physical_operator_states.hpp     | 12 ++++++------
 src/include/duckdb/execution/window_executor.hpp      |  2 +-
 src/include/duckdb/execution/window_segment_tree.hpp  |  2 +-
 src/include/duckdb/function/cast/default_casts.hpp    |  4 ++--
 src/include/duckdb/function/compression_function.hpp  |  8 ++++----
 src/include/duckdb/function/copy_function.hpp         |  6 +++---
 src/include/duckdb/function/function.hpp              |  4 ++--
 src/include/duckdb/function/scalar_function.hpp       |  2 +-
 src/include/duckdb/function/table_function.hpp        |  6 +++---
 src/include/duckdb/main/relation.hpp                  |  2 +-
 src/include/duckdb/parallel/event.hpp                 |  2 +-
 .../duckdb/parser/parsed_data/extra_drop_info.hpp     |  2 +-
 src/include/duckdb/parser/parsed_data/parse_info.hpp  |  2 +-
 src/include/duckdb/storage/data_pointer.hpp           |  2 +-
 src/include/duckdb/storage/index.hpp                  |  2 +-
 src/include/duckdb/storage/storage_manager.hpp        |  2 +-
 .../duckdb/storage/table/column_checkpoint_state.hpp  |  2 +-
 src/include/duckdb/storage/table/scan_state.hpp       |  4 ++--
 src/include/duckdb/transaction/transaction.hpp        |  2 +-
 26 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/include/duckdb/catalog/catalog_entry.hpp b/src/include/duckdb/catalog/catalog_entry.hpp
index fd0ed6e222b2..29c46fa10578 100644
--- a/src/include/duckdb/catalog/catalog_entry.hpp
+++ b/src/include/duckdb/catalog/catalog_entry.hpp
@@ -102,7 +102,7 @@ class CatalogEntry {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/common/allocator.hpp b/src/include/duckdb/common/allocator.hpp
index 7c82f049aaa7..ac27d267a9db 100644
--- a/src/include/duckdb/common/allocator.hpp
+++ b/src/include/duckdb/common/allocator.hpp
@@ -36,7 +36,7 @@ struct PrivateAllocatorData {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/common/extra_type_info.hpp b/src/include/duckdb/common/extra_type_info.hpp
index 5157293ca50e..8c8f8c0a62a7 100644
--- a/src/include/duckdb/common/extra_type_info.hpp
+++ b/src/include/duckdb/common/extra_type_info.hpp
@@ -50,7 +50,7 @@ struct ExtraTypeInfo {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}

diff --git a/src/include/duckdb/common/types/column/partitioned_column_data.hpp b/src/include/duckdb/common/types/column/partitioned_column_data.hpp
index 70caddbae666..058151cd56e9 100644
--- a/src/include/duckdb/common/types/column/partitioned_column_data.hpp
+++ b/src/include/duckdb/common/types/column/partitioned_column_data.hpp
@@ -117,7 +117,7 @@ class PartitionedColumnData {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp b/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp
index 3a4c7f56179b..6ca89aa7ea3c 100644
--- a/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp
+++ b/src/include/duckdb/common/types/row/partitioned_tuple_data.hpp
@@ -181,7 +181,7 @@ class PartitionedTupleData {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/common/types/vector_buffer.hpp b/src/include/duckdb/common/types/vector_buffer.hpp
index e1d49aacb629..7108b8dbf185 100644
--- a/src/include/duckdb/common/types/vector_buffer.hpp
+++ b/src/include/duckdb/common/types/vector_buffer.hpp
@@ -131,7 +131,7 @@ class VectorBuffer {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/execution/expression_executor_state.hpp b/src/include/duckdb/execution/expression_executor_state.hpp
index c4bbc40c6b6b..c0802db2a308 100644
--- a/src/include/duckdb/execution/expression_executor_state.hpp
+++ b/src/include/duckdb/execution/expression_executor_state.hpp
@@ -46,7 +46,7 @@ struct ExpressionState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/execution/physical_operator_states.hpp b/src/include/duckdb/execution/physical_operator_states.hpp
index b0bb166a975d..621c8124338d 100644
--- a/src/include/duckdb/execution/physical_operator_states.hpp
+++ b/src/include/duckdb/execution/physical_operator_states.hpp
@@ -52,7 +52,7 @@ class OperatorState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -69,7 +69,7 @@ class GlobalOperatorState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -90,7 +90,7 @@ class GlobalSinkState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
@@ -114,7 +114,7 @@ class LocalSinkState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -135,7 +135,7 @@ class GlobalSourceState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -152,7 +152,7 @@ class LocalSourceState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/execution/window_executor.hpp b/src/include/duckdb/execution/window_executor.hpp
index 9a5051fa5fc0..8d134d05e188 100644
--- a/src/include/duckdb/execution/window_executor.hpp
+++ b/src/include/duckdb/execution/window_executor.hpp
@@ -131,7 +131,7 @@ class WindowExecutorState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/execution/window_segment_tree.hpp b/src/include/duckdb/execution/window_segment_tree.hpp
index 0cfbff2874b7..bcdf87439f12 100644
--- a/src/include/duckdb/execution/window_segment_tree.hpp
+++ b/src/include/duckdb/execution/window_segment_tree.hpp
@@ -31,7 +31,7 @@ class WindowAggregatorState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}

diff --git a/src/include/duckdb/function/cast/default_casts.hpp b/src/include/duckdb/function/cast/default_casts.hpp
index e3d6072d5a2c..5d13c4354f58 100644
--- a/src/include/duckdb/function/cast/default_casts.hpp
+++ b/src/include/duckdb/function/cast/default_casts.hpp
@@ -30,7 +30,7 @@ struct BindCastInfo {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -48,7 +48,7 @@ struct BoundCastData {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/function/compression_function.hpp b/src/include/duckdb/function/compression_function.hpp
index 4095e1826229..e34e2d0c35ef 100644
--- a/src/include/duckdb/function/compression_function.hpp
+++ b/src/include/duckdb/function/compression_function.hpp
@@ -39,7 +39,7 @@ struct AnalyzeState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -55,7 +55,7 @@ struct CompressionState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -76,7 +76,7 @@ struct CompressedSegmentState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -96,7 +96,7 @@ struct CompressionAppendState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/function/copy_function.hpp b/src/include/duckdb/function/copy_function.hpp
index 99a01f8e0ba1..7e3a4808b435 100644
--- a/src/include/duckdb/function/copy_function.hpp
+++ b/src/include/duckdb/function/copy_function.hpp
@@ -30,7 +30,7 @@ struct LocalFunctionData {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -45,7 +45,7 @@ struct GlobalFunctionData {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -60,7 +60,7 @@ struct PreparedBatchData {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/function/function.hpp b/src/include/duckdb/function/function.hpp
index 65a186f0525d..7e89476e2526 100644
--- a/src/include/duckdb/function/function.hpp
+++ b/src/include/duckdb/function/function.hpp
@@ -58,13 +58,13 @@ struct FunctionData {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 	// FIXME: this function should be removed in the future
 	template <class TARGET>
 	TARGET &CastNoConst() const {
-		return const_cast<TARGET &>(reinterpret_cast<const TARGET &>(*this)); // NOLINT: FIXME
+		return const_cast<TARGET &>(Cast<TARGET>()); // NOLINT: FIXME
 	}
 };

diff --git a/src/include/duckdb/function/scalar_function.hpp b/src/include/duckdb/function/scalar_function.hpp
index ab65c97a2139..917f09eed2b4 100644
--- a/src/include/duckdb/function/scalar_function.hpp
+++ b/src/include/duckdb/function/scalar_function.hpp
@@ -29,7 +29,7 @@ struct FunctionLocalState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/function/table_function.hpp b/src/include/duckdb/function/table_function.hpp
index 3321d274f902..bd8e176973ee 100644
--- a/src/include/duckdb/function/table_function.hpp
+++ b/src/include/duckdb/function/table_function.hpp
@@ -36,7 +36,7 @@ struct TableFunctionInfo {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -60,7 +60,7 @@ struct GlobalTableFunctionState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -75,7 +75,7 @@ struct LocalTableFunctionState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/main/relation.hpp b/src/include/duckdb/main/relation.hpp
index 7d1798712975..c16cb2a3e829 100644
--- a/src/include/duckdb/main/relation.hpp
+++ b/src/include/duckdb/main/relation.hpp
@@ -185,7 +185,7 @@ class Relation : public std::enable_shared_from_this<Relation> {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/parallel/event.hpp b/src/include/duckdb/parallel/event.hpp
index 89a108d98a98..794d1344f1a6 100644
--- a/src/include/duckdb/parallel/event.hpp
+++ b/src/include/duckdb/parallel/event.hpp
@@ -59,7 +59,7 @@ class Event : public std::enable_shared_from_this<Event> {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}

diff --git a/src/include/duckdb/parser/parsed_data/extra_drop_info.hpp b/src/include/duckdb/parser/parsed_data/extra_drop_info.hpp
index b85c6252359f..2812469deb5c 100644
--- a/src/include/duckdb/parser/parsed_data/extra_drop_info.hpp
+++ b/src/include/duckdb/parser/parsed_data/extra_drop_info.hpp
@@ -38,7 +38,7 @@ struct ExtraDropInfo {
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 	virtual unique_ptr<ExtraDropInfo> Copy() const = 0;

diff --git a/src/include/duckdb/parser/parsed_data/parse_info.hpp b/src/include/duckdb/parser/parsed_data/parse_info.hpp
index d547065e9ef7..5d395c6adfcf 100644
--- a/src/include/duckdb/parser/parsed_data/parse_info.hpp
+++ b/src/include/duckdb/parser/parsed_data/parse_info.hpp
@@ -48,7 +48,7 @@ struct ParseInfo {
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}

diff --git a/src/include/duckdb/storage/data_pointer.hpp b/src/include/duckdb/storage/data_pointer.hpp
index c0c51df679aa..97752ee5e141 100644
--- a/src/include/duckdb/storage/data_pointer.hpp
+++ b/src/include/duckdb/storage/data_pointer.hpp
@@ -34,7 +34,7 @@ struct ColumnSegmentState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/storage/index.hpp b/src/include/duckdb/storage/index.hpp
index 179735e8c117..f5e89486b28f 100644
--- a/src/include/duckdb/storage/index.hpp
+++ b/src/include/duckdb/storage/index.hpp
@@ -160,7 +160,7 @@ class Index {
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/storage/storage_manager.hpp b/src/include/duckdb/storage/storage_manager.hpp
index 91fc96755719..e0c07b6b8903 100644
--- a/src/include/duckdb/storage/storage_manager.hpp
+++ b/src/include/duckdb/storage/storage_manager.hpp
@@ -96,7 +96,7 @@ class StorageManager {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/storage/table/column_checkpoint_state.hpp b/src/include/duckdb/storage/table/column_checkpoint_state.hpp
index 5ac11cf7cc28..1c1a68432375 100644
--- a/src/include/duckdb/storage/table/column_checkpoint_state.hpp
+++ b/src/include/duckdb/storage/table/column_checkpoint_state.hpp
@@ -51,7 +51,7 @@ struct ColumnCheckpointState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/storage/table/scan_state.hpp b/src/include/duckdb/storage/table/scan_state.hpp
index 6c919cfba0ff..7b8160fdc089 100644
--- a/src/include/duckdb/storage/table/scan_state.hpp
+++ b/src/include/duckdb/storage/table/scan_state.hpp
@@ -44,7 +44,7 @@ struct SegmentScanState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };
@@ -60,7 +60,7 @@ struct IndexScanState {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
 };

diff --git a/src/include/duckdb/transaction/transaction.hpp b/src/include/duckdb/transaction/transaction.hpp
index 1c47725c10dd..723c99460c52 100644
--- a/src/include/duckdb/transaction/transaction.hpp
+++ b/src/include/duckdb/transaction/transaction.hpp
@@ -66,7 +66,7 @@ class Transaction {
 	}
 	template <class TARGET>
 	const TARGET &Cast() const {
-		D_ASSERT(dynamic_cast<const TARGET *>(this));
+		DynamicCastCheck<TARGET>(this);
 		return reinterpret_cast<const TARGET &>(*this);
 	}
};
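Patch 147 completes the series by giving the const Cast() overloads the same DynamicCastCheck call the non-const overloads already use. Putting the pieces together, the resulting idiom looks roughly like the following self-contained sketch (simplified: assert stands in for D_ASSERT, and ParseInfo/AlterInfo here are just example types rather than the full DuckDB declarations):

// checked_cast_pattern.cpp — the const/non-const Cast pair after this series
#include <cassert>

template <class TARGET, class SRC>
void DynamicCastCheck(const SRC *source) {
	// the cast must be a no-op: dynamic_cast may not adjust the pointer
	assert(reinterpret_cast<const TARGET *>(source) == dynamic_cast<const TARGET *>(source));
}

struct ParseInfo {
	virtual ~ParseInfo() = default;

	template <class TARGET>
	TARGET &Cast() {
		DynamicCastCheck<TARGET>(this);
		return reinterpret_cast<TARGET &>(*this);
	}
	template <class TARGET>
	const TARGET &Cast() const {
		DynamicCastCheck<TARGET>(this); // mirrors the non-const case, as the commit title says
		return reinterpret_cast<const TARGET &>(*this);
	}
};

struct AlterInfo : ParseInfo {};

int main() {
	AlterInfo info;
	const ParseInfo &base = info;
	const AlterInfo &back = base.Cast<AlterInfo>(); // debug-checked, zero-cost in release builds
	(void)back;
	return 0;
}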