Skip to content

Commit

Permalink
stop at first empty row in range
Browse files Browse the repository at this point in the history
  • Loading branch information
Maxxen committed Nov 8, 2024
1 parent 4ddb1eb commit 697ec99
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 7 deletions.
30 changes: 23 additions & 7 deletions src/excel/xlsx/read_xlsx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -928,12 +928,13 @@ void HeaderSniffer::OnEndRow(const idx_t row_idx) {

class SheetParser final : public SheetParserBase {
public:
explicit SheetParser(ClientContext &context, const XLSXCellRange &range_p, const StringTable &table)
: string_table(table), range(range_p) {
explicit SheetParser(ClientContext &context, const XLSXCellRange &range_p, const StringTable &table, bool stop_at_empty_p)
: string_table(table), range(range_p), stop_at_empty(stop_at_empty_p) {

// Initialize the chunk
const vector<LogicalType> types(range.Width(), LogicalType::VARCHAR);
chunk.Initialize(context, types);
auto &buffer_alloc = BufferAllocator::Get(context);
chunk.Initialize(buffer_alloc, types);

// Set the beginning column
// Allocate the sheet row number mapping
Expand All @@ -947,7 +948,7 @@ class SheetParser final : public SheetParserBase {
const auto sheet_row = sheet_row_number[chunk_row];
const auto sheet_col = chunk_col + range.beg.col;

const XLSXCellPos pos = {sheet_col, static_cast<idx_t>(sheet_row)};
const XLSXCellPos pos = {static_cast<idx_t>(sheet_row), sheet_col};
return pos.ToString();
}

Expand All @@ -969,6 +970,9 @@ class SheetParser final : public SheetParserBase {

// The last column we wrote to
idx_t last_col = 0;

bool stop_at_empty = true;
bool is_row_empty = false;
};

void SheetParser::OnCell(const XLSXCellPos &pos, XLSXCellType type, vector<char> &data, idx_t style) {
Expand Down Expand Up @@ -1008,6 +1012,10 @@ void SheetParser::OnCell(const XLSXCellPos &pos, XLSXCellType type, vector<char>
ptr[out_index] = StringVector::AddString(vec, data.data(), data.size());
}

if(!data.empty()) {
is_row_empty = false;
}

last_col = pos.col;
}

Expand All @@ -1018,6 +1026,7 @@ void SheetParser::OnBeginRow(idx_t row_idx) {
}

last_col = range.beg.col - 1;
is_row_empty = true;
}

void SheetParser::OnEndRow(idx_t row_idx) {
Expand All @@ -1026,6 +1035,11 @@ void SheetParser::OnEndRow(idx_t row_idx) {
return;
}

if(stop_at_empty && is_row_empty) {
Stop(false);
return;
}

// If we didn't write out all the columns, pad with nulls
if(last_col + 1 < range.end.col) {
for(idx_t i = last_col + 1; i < range.end.col; i++) {
Expand Down Expand Up @@ -1174,6 +1188,7 @@ class XLSXFunctionData final : public TableFunctionData {
vector<XLSXCellType> source_types;

bool ignore_errors = false;
bool stop_at_empty = true;

// The range of the content in the sheet
XLSXCellRange content_range;
Expand Down Expand Up @@ -1246,6 +1261,7 @@ static unique_ptr<FunctionData> Bind(ClientContext &context, TableFunctionBindIn
range.end.row++;

detect_range = false;
result->stop_at_empty = false;
}

// Parse the styles (so we can handle dates)
Expand Down Expand Up @@ -1316,8 +1332,8 @@ static unique_ptr<FunctionData> Bind(ClientContext &context, TableFunctionBindIn
//-------------------------------------------------------------------
class XLSXGlobalState final : public GlobalTableFunctionState {
public:
explicit XLSXGlobalState(ClientContext &context, const string &file_name, const XLSXCellRange &range)
: archive(context, file_name), strings(BufferAllocator::Get(context)), parser(context, range, strings),
explicit XLSXGlobalState(ClientContext &context, const string &file_name, const XLSXCellRange &range, bool stop_at_empty)
: archive(context, file_name), strings(BufferAllocator::Get(context)), parser(context, range, strings, stop_at_empty),
buffer(make_unsafe_uniq_array_uninitialized<char>(BUFFER_SIZE)), cast_vec(LogicalType::DOUBLE) {}

ZipFileReader archive;
Expand All @@ -1339,7 +1355,7 @@ class XLSXGlobalState final : public GlobalTableFunctionState {

static unique_ptr<GlobalTableFunctionState> InitGlobal(ClientContext &context, TableFunctionInitInput &input) {
auto &data = input.bind_data->Cast<XLSXFunctionData>();
auto state = make_uniq<XLSXGlobalState>(context, data.file_path, data.content_range);
auto state = make_uniq<XLSXGlobalState>(context, data.file_path, data.content_range, data.stop_at_empty);

// Check if there is a string table. If there is, extract it
if(state->archive.TryOpenEntry("xl/sharedStrings.xml")) {
Expand Down
Binary file added test/data/xlsx/sparse.xlsx
Binary file not shown.

0 comments on commit 697ec99

Please sign in to comment.