Skip to content

Commit

Permalink
small optimizations, allow larger than max xlsx row size when reading
Browse files Browse the repository at this point in the history
  • Loading branch information
Maxxen committed Dec 5, 2024
1 parent 0ac2aec commit 871dcf9
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 33 deletions.
6 changes: 3 additions & 3 deletions src/excel/include/xlsx/parsers/worksheet_parser.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ inline void SheetParserBase::OnStartElement(const char *name, const char **atts)
} else {
XLSXCellPos cref;
if (!cref.TryParse(cref_ptr)) {
throw InvalidInputException("Invalid cell reference in sheet");
throw InvalidInputException("Invalid cell reference in sheet: %s", cref_ptr);
}
if (cref.row != cell_pos.row) {
throw InvalidInputException("Cell reference does not match row reference in sheet");
Expand Down Expand Up @@ -165,10 +165,10 @@ class RangeSniffer final : public SheetParserBase {
inline XLSXCellRange RangeSniffer::GetRange() const {
if (beg_row == 0) {
// We didn't find any rows... return the whole sheet
return XLSXCellRange(1, 1, XLSX_MAX_CELL_ROWS, XLSX_MAX_CELL_COLS);
return XLSXCellRange();
}
// Otherwise, return the sniffed range
return XLSXCellRange(beg_row, beg_col, XLSX_MAX_CELL_ROWS, end_col + 1);
return XLSXCellRange(beg_row, beg_col, NumericLimits<idx_t>::Maximum(), end_col + 1);
}

inline void RangeSniffer::OnCell(const XLSXCellPos &pos, XLSXCellType type, vector<char> &data, idx_t style) {
Expand Down
50 changes: 24 additions & 26 deletions src/excel/include/xlsx/xlsx_parts.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,6 @@ inline const char *XLSXCellPos::TryParse(const char *str) {
p_col = p_col * 26 + (*str - 'A' + 1);
str++;
}

if (p_col > XLSX_MAX_CELL_COLS) {
return nullptr;
}
col = p_col;
} else {
did_not_parse_col = true;
Expand All @@ -67,9 +63,6 @@ inline const char *XLSXCellPos::TryParse(const char *str) {
p_row = p_row * 10 + (*str - '0');
str++;
}
if (p_row > XLSX_MAX_CELL_ROWS) {
return nullptr;
}
row = p_row;
} else {
did_not_parse_row = true;
Expand Down Expand Up @@ -109,15 +102,15 @@ inline string XLSXCellPos::GetColumnName() const {
struct XLSXCellRange {

// 1-indexed, inclusive
XLSXCellPos beg = {1, 1};
XLSXCellPos beg;
// 1-indexed, exclusive
XLSXCellPos end = {XLSX_MAX_CELL_ROWS, XLSX_MAX_CELL_COLS};
XLSXCellPos end;

XLSXCellRange(idx_t beg_row, idx_t beg_col, idx_t end_row, idx_t end_col)
: beg(beg_row, beg_col), end(end_row, end_col) {
}

XLSXCellRange() : beg(1, 1), end(XLSX_MAX_CELL_ROWS, XLSX_MAX_CELL_COLS) {
XLSXCellRange() : beg(1, 1), end(NumericLimits<idx_t>::Maximum(), XLSX_MAX_CELL_COLS) {
}

// Try to parse a range from a string, e.g. "A1:B2"
Expand Down Expand Up @@ -190,28 +183,33 @@ inline XLSXCellType ParseCellType(const char *ctype) {
if (!ctype) {
return XLSXCellType::NUMBER;
}
if (strcmp(ctype, "n") == 0) {
switch (*ctype++) {
case 'n':
return XLSXCellType::NUMBER;
case 's': {
if (*ctype == 0) {
return XLSXCellType::SHARED_STRING;
}
if (strcmp(ctype, "tr") == 0) {
return XLSXCellType::FORMULA_STRING;
}
return XLSXCellType::UNKNOWN;
}
if (strcmp(ctype, "s") == 0) {
return XLSXCellType::SHARED_STRING;
}
if (strcmp(ctype, "d") == 0) {
case 'd':
return XLSXCellType::DATE;
}
if (strcmp(ctype, "inlineStr") == 0) {
return XLSXCellType::INLINE_STRING;
}
if (strcmp(ctype, "str") == 0) {
return XLSXCellType::FORMULA_STRING;
}
if (strcmp(ctype, "b") == 0) {
case 'b':
return XLSXCellType::BOOLEAN;
}
if (strcmp(ctype, "e") == 0) {
case 'e':
return XLSXCellType::ERROR;
case 'i': {
if (strcmp(ctype, "nlineStr") == 0) {
return XLSXCellType::INLINE_STRING;
}
return XLSXCellType::UNKNOWN;
}
default:
return XLSXCellType::UNKNOWN;
}
return XLSXCellType::UNKNOWN;
}

//-------------------------------------------------------------------------
Expand Down
4 changes: 4 additions & 0 deletions src/excel/xlsx/read_xlsx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ void ReadXLSX::ParseOptions(XLSXReadOptions &options, const named_parameter_map_
if (!range.IsValid()) {
throw BinderException("Invalid range '%s' specified", range_str);
}
// We do allow more rows than the maximum, but not more columns, because DuckDB kind of breaks down otherwise.
if (range.Width() > XLSX_MAX_CELL_COLS) {
throw BinderException("Invalid range '%s' specified", range_str);
}

// Make sure the range is inclusive of the last cell
range.end.col++;
Expand Down
7 changes: 5 additions & 2 deletions src/excel/xlsx/zip_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,15 +311,18 @@ bool ZipFileReader::TryOpenEntry(const string &file_name) {
if (mz_zip_reader_locate_entry(handle, file_name.c_str(), 0) != MZ_OK) {
return false;
}

if (mz_zip_reader_entry_open(handle) != MZ_OK) {
return false;
}

const auto len = mz_zip_reader_entry_save_buffer_length(handle);
if (len < 0) {
mz_zip_file *file_info = nullptr;
if (mz_zip_reader_entry_get_info(handle, &file_info) != MZ_OK) {
return false;
}

const auto len = file_info->uncompressed_size;

is_entry_open = true;
entry_pos = 0;
entry_len = len;
Expand Down
3 changes: 1 addition & 2 deletions test/sql/excel/xlsx/tpch/tpch.test
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,4 @@ statement ok
COPY lineitem TO '__TEST_DIR__/lineitem.xlsx' (FORMAT 'XLSX');

statement ok
CREATE TABLE AS t1 SELECT * FROM '__TEST_DIR__/lineitem.xlsx';
----
CREATE TABLE t1 AS SELECT * FROM '__TEST_DIR__/lineitem.xlsx';

0 comments on commit 871dcf9

Please sign in to comment.