Skip to content

Commit 0c74f1b

Browse files
authored
Merge pull request #9 from zfarrell/feature/parse-statements-issue-8
Add statement parsing functions
2 parents da3e91e + 1db1f99 commit 0c74f1b

File tree

8 files changed

+471
-3
lines changed

8 files changed

+471
-3
lines changed

CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@ set(LOADABLE_EXTENSION_NAME ${TARGET_NAME}_loadable_extension)
99
project(${TARGET_NAME})
1010
include_directories(src/include)
1111

12-
set(EXTENSION_SOURCES
12+
set(EXTENSION_SOURCES
1313
src/parser_tools_extension.cpp
1414
src/parse_tables.cpp
1515
src/parse_where.cpp
1616
src/parse_functions.cpp
17+
src/parse_statements.cpp
1718
)
1819

1920
build_static_extension(${TARGET_NAME} ${EXTENSION_SOURCES})

README.md

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,14 @@ An experimental DuckDB extension that exposes functionality from DuckDB's native
44

55
## Overview
66

7-
`parser_tools` is a DuckDB extension designed to provide SQL parsing capabilities within the database. It allows you to analyze SQL queries and extract structural information directly in SQL. This extension provides parsing functions for tables, WHERE clauses, and function calls (see [Functions](#functions) below).
7+
`parser_tools` is a DuckDB extension designed to provide SQL parsing capabilities within the database. It allows you to analyze SQL queries and extract structural information directly in SQL. This extension provides parsing functions for tables, WHERE clauses, function calls, and statements.
88

99
## Features
1010

1111
- **Extract table references** from a SQL query with context information (e.g. `FROM`, `JOIN`, etc.)
1212
- **Extract function calls** from a SQL query with context information (e.g. `SELECT`, `WHERE`, `HAVING`, etc.)
1313
- **Parse WHERE clauses** to extract conditions and operators
14+
- **Parse multi-statement SQL** to extract individual statements or count the number of statements
1415
- Support for **window functions**, **nested functions**, and **CTEs**
1516
- Includes **schema**, **name**, and **context** information for all extractions
1617
- Built on DuckDB's native SQL parser
@@ -94,7 +95,7 @@ Context helps identify where elements are used in the query.
9495

9596
## Functions
9697

97-
This extension provides parsing functions for tables, functions, and WHERE clauses. Each category includes both table functions (for detailed results) and scalar functions (for programmatic use).
98+
This extension provides parsing functions for tables, functions, WHERE clauses, and statements. Each category includes both table functions (for detailed results) and scalar functions (for programmatic use).
9899

99100
In general, errors (e.g. Parse Exception) will not be exposed to the user, but instead will result in an empty result. This simplifies batch processing. When validity is needed, [is_parsable](#is_parsablesql_query--scalar-function) can be used.
100101

@@ -319,6 +320,92 @@ FROM (VALUES
319320
└───────────────────────────────────────────────┴────────┘
320321
```
321322

323+
---
324+
325+
### Statement Parsing Functions
326+
327+
These functions parse multi-statement SQL strings and extract individual statements or count them.
328+
329+
#### `parse_statements(sql_query)` – Table Function
330+
331+
Parses a SQL string containing multiple statements and returns each statement as a separate row.
332+
333+
##### Usage
334+
```sql
335+
SELECT * FROM parse_statements('SELECT 42; SELECT 43;');
336+
```
337+
338+
##### Returns
339+
A table with:
340+
- `statement`: the SQL statement text
341+
342+
##### Example
343+
```sql
344+
SELECT * FROM parse_statements($$
345+
SELECT * FROM users WHERE active = true;
346+
INSERT INTO log VALUES ('query executed');
347+
SELECT count(*) FROM transactions;
348+
$$);
349+
```
350+
351+
| statement |
352+
|-----------|
353+
| SELECT * FROM users WHERE (active = true) |
354+
| INSERT INTO log (VALUES ('query executed')) |
355+
| SELECT count_star() FROM transactions |
356+
357+
---
358+
359+
#### `parse_statements(sql_query)` – Scalar Function
360+
361+
Returns a list of statement strings from a multi-statement SQL query.
362+
363+
##### Usage
364+
```sql
365+
SELECT parse_statements('SELECT 42; SELECT 43;');
366+
----
367+
[SELECT 42, SELECT 43]
368+
```
369+
370+
##### Returns
371+
A list of strings, each being a SQL statement.
372+
373+
##### Example
374+
```sql
375+
SELECT parse_statements('SELECT 1; INSERT INTO test VALUES (2); SELECT 3;');
376+
----
377+
[SELECT 1, 'INSERT INTO test (VALUES (2))', SELECT 3]
378+
```
379+
380+
---
381+
382+
#### `num_statements(sql_query)` – Scalar Function
383+
384+
Returns the number of statements in a multi-statement SQL query.
385+
386+
##### Usage
387+
```sql
388+
SELECT num_statements('SELECT 42; SELECT 43;');
389+
----
390+
2
391+
```
392+
393+
##### Returns
394+
An integer count of the number of SQL statements.
395+
396+
##### Example
397+
```sql
398+
SELECT num_statements($$
399+
WITH cte AS (SELECT 1) SELECT * FROM cte;
400+
UPDATE users SET last_seen = now();
401+
SELECT count(*) FROM users;
402+
DELETE FROM temp_data;
403+
$$);
404+
----
405+
4
406+
```
407+
408+
---
322409

323410
## Development
324411

src/include/parse_statements.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#pragma once
2+
3+
#include "duckdb.hpp"
4+
#include <string>
5+
#include <vector>
6+
7+
namespace duckdb {
8+
9+
// Forward declarations
10+
class ExtensionLoader;
11+
12+
struct StatementResult {
13+
std::string statement;
14+
};
15+
16+
void RegisterParseStatementsFunction(ExtensionLoader &loader);
17+
void RegisterParseStatementsScalarFunction(ExtensionLoader &loader);
18+
19+
} // namespace duckdb

src/parse_statements.cpp

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
#include "parse_statements.hpp"
2+
#include "duckdb.hpp"
3+
#include "duckdb/parser/parser.hpp"
4+
#include "duckdb/parser/statement/select_statement.hpp"
5+
#include "duckdb/function/scalar/nested_functions.hpp"
6+
7+
namespace duckdb {
8+
9+
struct ParseStatementsState : public GlobalTableFunctionState {
10+
idx_t row = 0;
11+
vector<StatementResult> results;
12+
};
13+
14+
struct ParseStatementsBindData : public TableFunctionData {
15+
string sql;
16+
};
17+
18+
// BIND function: runs during query planning to decide output schema
19+
static unique_ptr<FunctionData> ParseStatementsBind(ClientContext &context,
20+
TableFunctionBindInput &input,
21+
vector<LogicalType> &return_types,
22+
vector<string> &names) {
23+
24+
string sql_input = StringValue::Get(input.inputs[0]);
25+
26+
// Return single column with statement text
27+
return_types = {LogicalType::VARCHAR};
28+
names = {"statement"};
29+
30+
// Create a bind data object to hold the SQL input
31+
auto result = make_uniq<ParseStatementsBindData>();
32+
result->sql = sql_input;
33+
34+
return std::move(result);
35+
}
36+
37+
// INIT function: runs before table function execution
38+
static unique_ptr<GlobalTableFunctionState> ParseStatementsInit(ClientContext &context,
39+
TableFunctionInitInput &input) {
40+
return make_uniq<ParseStatementsState>();
41+
}
42+
43+
static void ExtractStatementsFromSQL(const std::string &sql, std::vector<StatementResult> &results) {
44+
Parser parser;
45+
46+
try {
47+
parser.ParseQuery(sql);
48+
} catch (const ParserException &ex) {
49+
// Swallow parser exceptions to make this function more robust
50+
return;
51+
}
52+
53+
for (auto &stmt : parser.statements) {
54+
if (stmt) {
55+
// Convert statement back to string
56+
auto statement_str = stmt->ToString();
57+
results.push_back(StatementResult{statement_str});
58+
}
59+
}
60+
}
61+
62+
static void ParseStatementsFunction(ClientContext &context,
63+
TableFunctionInput &data,
64+
DataChunk &output) {
65+
auto &state = (ParseStatementsState &)*data.global_state;
66+
auto &bind_data = (ParseStatementsBindData &)*data.bind_data;
67+
68+
if (state.results.empty() && state.row == 0) {
69+
ExtractStatementsFromSQL(bind_data.sql, state.results);
70+
}
71+
72+
if (state.row >= state.results.size()) {
73+
return;
74+
}
75+
76+
auto &stmt = state.results[state.row];
77+
output.SetCardinality(1);
78+
output.SetValue(0, 0, Value(stmt.statement));
79+
80+
state.row++;
81+
}
82+
83+
static void ParseStatementsScalarFunction(DataChunk &args, ExpressionState &state, Vector &result) {
84+
UnaryExecutor::Execute<string_t, list_entry_t>(args.data[0], result, args.size(),
85+
[&result](string_t query) -> list_entry_t {
86+
// Parse the SQL query and extract statements
87+
auto query_string = query.GetString();
88+
std::vector<StatementResult> parsed_statements;
89+
ExtractStatementsFromSQL(query_string, parsed_statements);
90+
91+
auto current_size = ListVector::GetListSize(result);
92+
auto number_of_statements = parsed_statements.size();
93+
auto new_size = current_size + number_of_statements;
94+
95+
// Grow list if needed
96+
if (ListVector::GetListCapacity(result) < new_size) {
97+
ListVector::Reserve(result, new_size);
98+
}
99+
100+
// Write the statements into the child vector
101+
auto statements = FlatVector::GetData<string_t>(ListVector::GetEntry(result));
102+
for (size_t i = 0; i < parsed_statements.size(); i++) {
103+
auto &stmt = parsed_statements[i];
104+
statements[current_size + i] = StringVector::AddStringOrBlob(ListVector::GetEntry(result), stmt.statement);
105+
}
106+
107+
// Update size
108+
ListVector::SetListSize(result, new_size);
109+
110+
return list_entry_t(current_size, number_of_statements);
111+
});
112+
}
113+
114+
static void NumStatementsScalarFunction(DataChunk &args, ExpressionState &state, Vector &result) {
115+
UnaryExecutor::Execute<string_t, int64_t>(args.data[0], result, args.size(),
116+
[](string_t query) -> int64_t {
117+
// Parse the SQL query and count statements
118+
auto query_string = query.GetString();
119+
std::vector<StatementResult> parsed_statements;
120+
ExtractStatementsFromSQL(query_string, parsed_statements);
121+
122+
return static_cast<int64_t>(parsed_statements.size());
123+
});
124+
}
125+
126+
// Extension scaffolding
127+
// ---------------------------------------------------
128+
129+
void RegisterParseStatementsFunction(ExtensionLoader &loader) {
130+
// Table function that returns one row per statement
131+
TableFunction tf("parse_statements", {LogicalType::VARCHAR}, ParseStatementsFunction, ParseStatementsBind, ParseStatementsInit);
132+
loader.RegisterFunction(tf);
133+
}
134+
135+
void RegisterParseStatementsScalarFunction(ExtensionLoader &loader) {
136+
// parse_statements is a scalar function that returns a list of statement strings
137+
ScalarFunction sf("parse_statements", {LogicalType::VARCHAR}, LogicalType::LIST(LogicalType::VARCHAR), ParseStatementsScalarFunction);
138+
loader.RegisterFunction(sf);
139+
140+
// num_statements is a scalar function that returns the count of statements
141+
ScalarFunction num_sf("num_statements", {LogicalType::VARCHAR}, LogicalType::BIGINT, NumStatementsScalarFunction);
142+
loader.RegisterFunction(num_sf);
143+
}
144+
145+
} // namespace duckdb

src/parser_tools_extension.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "parse_tables.hpp"
55
#include "parse_where.hpp"
66
#include "parse_functions.hpp"
7+
#include "parse_statements.hpp"
78
#include "duckdb.hpp"
89
#include "duckdb/common/exception.hpp"
910
#include "duckdb/common/string_util.hpp"
@@ -30,6 +31,8 @@ static void LoadInternal(ExtensionLoader &loader) {
3031
RegisterParseWhereDetailedFunction(loader);
3132
RegisterParseFunctionsFunction(loader);
3233
RegisterParseFunctionScalarFunction(loader);
34+
RegisterParseStatementsFunction(loader);
35+
RegisterParseStatementsScalarFunction(loader);
3336
}
3437

3538
void ParserToolsExtension::Load(ExtensionLoader &loader) {
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
# name: test/sql/parser_tools/scalar_functions/num_statements.test
2+
# description: test num_statements scalar function
3+
# group: [num_statements]
4+
5+
# Before we load the extension, this will fail
6+
statement error
7+
SELECT num_statements('SELECT 42; SELECT 43;');
8+
----
9+
Catalog Error: Scalar Function with name num_statements does not exist!
10+
11+
# Require statement will ensure this test is run with this extension loaded
12+
require parser_tools
13+
14+
# Single statement
15+
query I
16+
SELECT num_statements('SELECT 42;');
17+
----
18+
1
19+
20+
# Two statements
21+
query I
22+
SELECT num_statements('SELECT 42; SELECT 43;');
23+
----
24+
2
25+
26+
# Three statements
27+
query I
28+
SELECT num_statements('SELECT 1; SELECT 2; SELECT 3;');
29+
----
30+
3
31+
32+
# Multiple statements with different types
33+
query I
34+
SELECT num_statements('SELECT 1; INSERT INTO test VALUES (2); UPDATE test SET x = 1; SELECT 3;');
35+
----
36+
4
37+
38+
# Complex multi-statement query
39+
query I
40+
SELECT num_statements($$
41+
WITH cte AS (SELECT 1 as a) SELECT * FROM cte;
42+
SELECT upper('hello');
43+
SELECT count(*) FROM users WHERE id > 10;
44+
INSERT INTO log VALUES ('done');
45+
$$);
46+
----
47+
4
48+
49+
# Single complex statement
50+
query I
51+
SELECT num_statements($$
52+
WITH cte1 AS (SELECT * FROM table1),
53+
cte2 AS (SELECT * FROM table2)
54+
SELECT cte1.id, cte2.name
55+
FROM cte1
56+
JOIN cte2 ON cte1.id = cte2.id
57+
WHERE cte1.active = true
58+
ORDER BY cte1.created_at DESC;
59+
$$);
60+
----
61+
1
62+
63+
# Empty input
64+
query I
65+
SELECT num_statements('');
66+
----
67+
0
68+
69+
# Whitespace only
70+
query I
71+
SELECT num_statements(' ');
72+
----
73+
0
74+
75+
# Invalid SQL should return 0
76+
query I
77+
SELECT num_statements('INVALID SQL SYNTAX HERE');
78+
----
79+
0

0 commit comments

Comments
 (0)