Skip to content

Commit b0925c8

Browse files
authored
Support alternate format for Utf8 unparsing (CHAR) (apache#11494)
* Add dialect param to use CHAR instead of TEXT for Utf8 unparsing for MySQL (#12)
* Configurable data type instead of flag for Utf8 unparsing
* Fix type in comment
1 parent de0765a commit b0925c8

File tree

2 files changed

+83
-3
lines changed

2 files changed

+83
-3
lines changed

datafusion/sql/src/unparser/dialect.rs

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
// under the License.
1717

1818
use regex::Regex;
19-
use sqlparser::keywords::ALL_KEYWORDS;
19+
use sqlparser::{ast, keywords::ALL_KEYWORDS};
2020

2121
/// `Dialect` to use for Unparsing
2222
///
@@ -45,6 +45,17 @@ pub trait Dialect {
4545
fn interval_style(&self) -> IntervalStyle {
4646
IntervalStyle::PostgresVerbose
4747
}
48+
49+
/// The SQL type to use when unparsing Arrow `Utf8`.
50+
/// Most dialects use `VARCHAR`, but some, like MySQL, require `CHAR`.
51+
fn utf8_cast_dtype(&self) -> ast::DataType {
52+
ast::DataType::Varchar(None)
53+
}
54+
/// The SQL type to use when unparsing Arrow `LargeUtf8`.
55+
/// Most dialects use `TEXT`, but some, like MySQL, require `CHAR`.
56+
fn large_utf8_cast_dtype(&self) -> ast::DataType {
57+
ast::DataType::Text
58+
}
4859
}
4960

5061
/// `IntervalStyle` to use for unparsing
@@ -103,6 +114,14 @@ impl Dialect for MySqlDialect {
103114
fn interval_style(&self) -> IntervalStyle {
104115
IntervalStyle::MySQL
105116
}
117+
118+
fn utf8_cast_dtype(&self) -> ast::DataType {
119+
ast::DataType::Char(None)
120+
}
121+
122+
fn large_utf8_cast_dtype(&self) -> ast::DataType {
123+
ast::DataType::Char(None)
124+
}
106125
}
107126

108127
pub struct SqliteDialect {}
@@ -118,6 +137,8 @@ pub struct CustomDialect {
118137
supports_nulls_first_in_sort: bool,
119138
use_timestamp_for_date64: bool,
120139
interval_style: IntervalStyle,
140+
utf8_cast_dtype: ast::DataType,
141+
large_utf8_cast_dtype: ast::DataType,
121142
}
122143

123144
impl Default for CustomDialect {
@@ -127,6 +148,8 @@ impl Default for CustomDialect {
127148
supports_nulls_first_in_sort: true,
128149
use_timestamp_for_date64: false,
129150
interval_style: IntervalStyle::SQLStandard,
151+
utf8_cast_dtype: ast::DataType::Varchar(None),
152+
large_utf8_cast_dtype: ast::DataType::Text,
130153
}
131154
}
132155
}
@@ -158,6 +181,14 @@ impl Dialect for CustomDialect {
158181
fn interval_style(&self) -> IntervalStyle {
159182
self.interval_style
160183
}
184+
185+
fn utf8_cast_dtype(&self) -> ast::DataType {
186+
self.utf8_cast_dtype.clone()
187+
}
188+
189+
fn large_utf8_cast_dtype(&self) -> ast::DataType {
190+
self.large_utf8_cast_dtype.clone()
191+
}
161192
}
162193

163194
/// `CustomDialectBuilder` to build `CustomDialect` using builder pattern
@@ -179,6 +210,8 @@ pub struct CustomDialectBuilder {
179210
supports_nulls_first_in_sort: bool,
180211
use_timestamp_for_date64: bool,
181212
interval_style: IntervalStyle,
213+
utf8_cast_dtype: ast::DataType,
214+
large_utf8_cast_dtype: ast::DataType,
182215
}
183216

184217
impl Default for CustomDialectBuilder {
@@ -194,6 +227,8 @@ impl CustomDialectBuilder {
194227
supports_nulls_first_in_sort: true,
195228
use_timestamp_for_date64: false,
196229
interval_style: IntervalStyle::PostgresVerbose,
230+
utf8_cast_dtype: ast::DataType::Varchar(None),
231+
large_utf8_cast_dtype: ast::DataType::Text,
197232
}
198233
}
199234

@@ -203,6 +238,8 @@ impl CustomDialectBuilder {
203238
supports_nulls_first_in_sort: self.supports_nulls_first_in_sort,
204239
use_timestamp_for_date64: self.use_timestamp_for_date64,
205240
interval_style: self.interval_style,
241+
utf8_cast_dtype: self.utf8_cast_dtype,
242+
large_utf8_cast_dtype: self.large_utf8_cast_dtype,
206243
}
207244
}
208245

@@ -235,4 +272,17 @@ impl CustomDialectBuilder {
235272
self.interval_style = interval_style;
236273
self
237274
}
275+
276+
pub fn with_utf8_cast_dtype(mut self, utf8_cast_dtype: ast::DataType) -> Self {
277+
self.utf8_cast_dtype = utf8_cast_dtype;
278+
self
279+
}
280+
281+
pub fn with_large_utf8_cast_dtype(
282+
mut self,
283+
large_utf8_cast_dtype: ast::DataType,
284+
) -> Self {
285+
self.large_utf8_cast_dtype = large_utf8_cast_dtype;
286+
self
287+
}
238288
}

datafusion/sql/src/unparser/expr.rs

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,8 +1275,8 @@ impl Unparser<'_> {
12751275
DataType::BinaryView => {
12761276
not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
12771277
}
1278-
DataType::Utf8 => Ok(ast::DataType::Varchar(None)),
1279-
DataType::LargeUtf8 => Ok(ast::DataType::Text),
1278+
DataType::Utf8 => Ok(self.dialect.utf8_cast_dtype()),
1279+
DataType::LargeUtf8 => Ok(self.dialect.large_utf8_cast_dtype()),
12801280
DataType::Utf8View => {
12811281
not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
12821282
}
@@ -1936,4 +1936,34 @@ mod tests {
19361936
assert_eq!(actual, expected);
19371937
}
19381938
}
1939+
1940+
#[test]
1941+
fn custom_dialect_use_char_for_utf8_cast() -> Result<()> {
1942+
let default_dialect = CustomDialectBuilder::default().build();
1943+
let mysql_custom_dialect = CustomDialectBuilder::new()
1944+
.with_utf8_cast_dtype(ast::DataType::Char(None))
1945+
.with_large_utf8_cast_dtype(ast::DataType::Char(None))
1946+
.build();
1947+
1948+
for (dialect, data_type, identifier) in [
1949+
(&default_dialect, DataType::Utf8, "VARCHAR"),
1950+
(&default_dialect, DataType::LargeUtf8, "TEXT"),
1951+
(&mysql_custom_dialect, DataType::Utf8, "CHAR"),
1952+
(&mysql_custom_dialect, DataType::LargeUtf8, "CHAR"),
1953+
] {
1954+
let unparser = Unparser::new(dialect);
1955+
1956+
let expr = Expr::Cast(Cast {
1957+
expr: Box::new(col("a")),
1958+
data_type,
1959+
});
1960+
let ast = unparser.expr_to_sql(&expr)?;
1961+
1962+
let actual = format!("{}", ast);
1963+
let expected = format!(r#"CAST(a AS {identifier})"#);
1964+
1965+
assert_eq!(actual, expected);
1966+
}
1967+
Ok(())
1968+
}
19391969
}

0 commit comments

Comments
 (0)