Skip to content

Commit 9d3904a

Browse files
alexander-beedie (Alexander Beedie)
authored and committed
Add support for IS [NOT] [form] NORMALIZED (apache#1655)
Co-authored-by: Alexander Beedie <alexander.beedie@adia.ae>
1 parent e6e5a6f commit 9d3904a

File tree

8 files changed

+185
-17
lines changed

8 files changed

+185
-17
lines changed

src/ast/mod.rs

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ pub use self::trigger::{
8383

8484
pub use self::value::{
8585
escape_double_quote_string, escape_quoted_string, DateTimeField, DollarQuotedString,
86-
TrimWhereField, Value,
86+
NormalizationForm, TrimWhereField, Value,
8787
};
8888

8989
use crate::ast::helpers::stmt_data_loading::{
@@ -653,6 +653,12 @@ pub enum Expr {
653653
IsDistinctFrom(Box<Expr>, Box<Expr>),
654654
/// `IS NOT DISTINCT FROM` operator
655655
IsNotDistinctFrom(Box<Expr>, Box<Expr>),
656+
/// `<expr> IS [ NOT ] [ form ] NORMALIZED`
657+
IsNormalized {
658+
expr: Box<Expr>,
659+
form: Option<NormalizationForm>,
660+
negated: bool,
661+
},
656662
/// `[ NOT ] IN (val1, val2, ...)`
657663
InList {
658664
expr: Box<Expr>,
@@ -1125,7 +1131,7 @@ impl fmt::Display for LambdaFunction {
11251131
/// `OneOrManyWithParens` implements `Deref<Target = [T]>` and `IntoIterator`,
11261132
/// so you can call slice methods on it and iterate over items
11271133
/// # Examples
1128-
/// Acessing as a slice:
1134+
/// Accessing as a slice:
11291135
/// ```
11301136
/// # use sqlparser::ast::OneOrManyWithParens;
11311137
/// let one = OneOrManyWithParens::One("a");
@@ -1437,6 +1443,24 @@ impl fmt::Display for Expr {
14371443
if *regexp { "REGEXP" } else { "RLIKE" },
14381444
pattern
14391445
),
1446+
Expr::IsNormalized {
1447+
expr,
1448+
form,
1449+
negated,
1450+
} => {
1451+
let not_ = if *negated { "NOT " } else { "" };
1452+
if form.is_none() {
1453+
write!(f, "{} IS {}NORMALIZED", expr, not_)
1454+
} else {
1455+
write!(
1456+
f,
1457+
"{} IS {}{} NORMALIZED",
1458+
expr,
1459+
not_,
1460+
form.as_ref().unwrap()
1461+
)
1462+
}
1463+
}
14401464
Expr::SimilarTo {
14411465
negated,
14421466
expr,
@@ -7817,7 +7841,7 @@ where
78177841
/// ```sql
78187842
/// EXPLAIN (ANALYZE, VERBOSE TRUE, FORMAT TEXT) SELECT * FROM my_table;
78197843
///
7820-
/// VACCUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
7844+
/// VACUUM (VERBOSE, ANALYZE ON, PARALLEL 10) my_table;
78217845
/// ```
78227846
#[derive(Debug, Clone, PartialEq, PartialOrd, Eq, Ord, Hash)]
78237847
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]

src/ast/query.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2821,10 +2821,10 @@ impl fmt::Display for ValueTableMode {
28212821
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
28222822
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
28232823
pub enum UpdateTableFromKind {
2824-
/// Update Statment where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
2824+
/// Update Statement where the 'FROM' clause is before the 'SET' keyword (Supported by Snowflake)
28252825
/// For Example: `UPDATE FROM t1 SET t1.name='aaa'`
28262826
BeforeSet(TableWithJoins),
2827-
/// Update Statment where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
2827+
/// Update Statement where the 'FROM' clause is after the 'SET' keyword (Which is the standard way)
28282828
/// For Example: `UPDATE SET t1.name='aaa' FROM t1`
28292829
AfterSet(TableWithJoins),
28302830
}

src/ast/spans.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1330,6 +1330,12 @@ impl Spanned for Expr {
13301330
escape_char: _,
13311331
any: _,
13321332
} => expr.span().union(&pattern.span()),
1333+
Expr::RLike { .. } => Span::empty(),
1334+
Expr::IsNormalized {
1335+
expr,
1336+
form: _,
1337+
negated: _,
1338+
} => expr.span(),
13331339
Expr::SimilarTo {
13341340
negated: _,
13351341
expr,
@@ -1365,7 +1371,6 @@ impl Spanned for Expr {
13651371
Expr::Array(array) => array.span(),
13661372
Expr::MatchAgainst { .. } => Span::empty(),
13671373
Expr::JsonAccess { value, path } => value.span().union(&path.span()),
1368-
Expr::RLike { .. } => Span::empty(),
13691374
Expr::AnyOp {
13701375
left,
13711376
compare_op: _,

src/ast/value.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,35 @@ impl fmt::Display for DateTimeField {
270270
}
271271
}
272272

273+
#[derive(Debug, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)]
274+
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
275+
#[cfg_attr(feature = "visitor", derive(Visit, VisitMut))]
276+
/// The Unicode Standard defines four normalization forms, which are intended to eliminate
277+
/// certain distinctions between visually or functionally identical characters.
278+
///
279+
/// See [Unicode Normalization Forms](https://unicode.org/reports/tr15/) for details.
280+
pub enum NormalizationForm {
281+
/// Canonical Decomposition, followed by Canonical Composition.
282+
NFC,
283+
/// Canonical Decomposition.
284+
NFD,
285+
/// Compatibility Decomposition, followed by Canonical Composition.
286+
NFKC,
287+
/// Compatibility Decomposition.
288+
NFKD,
289+
}
290+
291+
impl fmt::Display for NormalizationForm {
292+
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
293+
match self {
294+
NormalizationForm::NFC => write!(f, "NFC"),
295+
NormalizationForm::NFD => write!(f, "NFD"),
296+
NormalizationForm::NFKC => write!(f, "NFKC"),
297+
NormalizationForm::NFKD => write!(f, "NFKD"),
298+
}
299+
}
300+
}
301+
273302
pub struct EscapeQuotedString<'a> {
274303
string: &'a str,
275304
quote: char,

src/keywords.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -530,6 +530,10 @@ define_keywords!(
530530
NESTED,
531531
NEW,
532532
NEXT,
533+
NFC,
534+
NFD,
535+
NFKC,
536+
NFKD,
533537
NO,
534538
NOBYPASSRLS,
535539
NOCREATEDB,
@@ -540,6 +544,7 @@ define_keywords!(
540544
NOORDER,
541545
NOREPLICATION,
542546
NORMALIZE,
547+
NORMALIZED,
543548
NOSCAN,
544549
NOSUPERUSER,
545550
NOT,

src/parser/mod.rs

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3184,9 +3184,11 @@ impl<'a> Parser<'a> {
31843184
{
31853185
let expr2 = self.parse_expr()?;
31863186
Ok(Expr::IsNotDistinctFrom(Box::new(expr), Box::new(expr2)))
3187+
} else if let Ok(is_normalized) = self.parse_unicode_is_normalized(expr) {
3188+
Ok(is_normalized)
31873189
} else {
31883190
self.expected(
3189-
"[NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS",
3191+
"[NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS",
31903192
self.peek_token(),
31913193
)
31923194
}
@@ -3861,7 +3863,7 @@ impl<'a> Parser<'a> {
38613863
/// If the current token is the `expected` keyword, consume the token.
38623864
/// Otherwise, return an error.
38633865
///
3864-
// todo deprecate infavor of expected_keyword_is
3866+
// todo deprecate in favor of expected_keyword_is
38653867
pub fn expect_keyword(&mut self, expected: Keyword) -> Result<TokenWithSpan, ParserError> {
38663868
if self.parse_keyword(expected) {
38673869
Ok(self.get_current_token().clone())
@@ -8463,6 +8465,33 @@ impl<'a> Parser<'a> {
84638465
}
84648466
}
84658467

8468+
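/// Parse the `[NOT] [form] NORMALIZED` tail of an `<expr> IS [NOT] [form] NORMALIZED` predicate, where the optional `form` is one of `NFC`, `NFD`, `NFKC` or `NFKD`; `expr` is the already-parsed left-hand operand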
8469+
pub fn parse_unicode_is_normalized(&mut self, expr: Expr) -> Result<Expr, ParserError> {
8470+
let neg = self.parse_keyword(Keyword::NOT);
8471+
let normalized_form = self.maybe_parse(|parser| {
8472+
match parser.parse_one_of_keywords(&[
8473+
Keyword::NFC,
8474+
Keyword::NFD,
8475+
Keyword::NFKC,
8476+
Keyword::NFKD,
8477+
]) {
8478+
Some(Keyword::NFC) => Ok(NormalizationForm::NFC),
8479+
Some(Keyword::NFD) => Ok(NormalizationForm::NFD),
8480+
Some(Keyword::NFKC) => Ok(NormalizationForm::NFKC),
8481+
Some(Keyword::NFKD) => Ok(NormalizationForm::NFKD),
8482+
_ => parser.expected("unicode normalization form", parser.peek_token()),
8483+
}
8484+
})?;
8485+
if self.parse_keyword(Keyword::NORMALIZED) {
8486+
return Ok(Expr::IsNormalized {
8487+
expr: Box::new(expr),
8488+
form: normalized_form,
8489+
negated: neg,
8490+
});
8491+
}
8492+
self.expected("unicode normalization form", self.peek_token())
8493+
}
8494+
84668495
pub fn parse_enum_values(&mut self) -> Result<Vec<EnumMember>, ParserError> {
84678496
self.expect_token(&Token::LParen)?;
84688497
let values = self.parse_comma_separated(|parser| {
@@ -8989,7 +9018,7 @@ impl<'a> Parser<'a> {
89899018
}
89909019
}
89919020

8992-
/// Parse a table object for insetion
9021+
/// Parse a table object for insertion
89939022
/// e.g. `some_database.some_table` or `FUNCTION some_table_func(...)`
89949023
pub fn parse_table_object(&mut self) -> Result<TableObject, ParserError> {
89959024
if self.dialect.supports_insert_table_function() && self.parse_keyword(Keyword::FUNCTION) {
@@ -11897,7 +11926,7 @@ impl<'a> Parser<'a> {
1189711926
} else {
1189811927
let mut name = self.parse_grantee_name()?;
1189911928
if self.consume_token(&Token::Colon) {
11900-
// Redshift supports namespace prefix for extenrnal users and groups:
11929+
// Redshift supports namespace prefix for external users and groups:
1190111930
// <Namespace>:<GroupName> or <Namespace>:<UserName>
1190211931
// https://docs.aws.amazon.com/redshift/latest/mgmt/redshift-iam-access-control-native-idp.html
1190311932
let ident = self.parse_identifier()?;
@@ -12893,7 +12922,7 @@ impl<'a> Parser<'a> {
1289312922
Ok(WithFill { from, to, step })
1289412923
}
1289512924

12896-
// Parse a set of comma seperated INTERPOLATE expressions (ClickHouse dialect)
12925+
// Parse a set of comma separated INTERPOLATE expressions (ClickHouse dialect)
1289712926
// that follow the INTERPOLATE keyword in an ORDER BY clause with the WITH FILL modifier
1289812927
pub fn parse_interpolations(&mut self) -> Result<Option<Interpolate>, ParserError> {
1289912928
if !self.parse_keyword(Keyword::INTERPOLATE) {
@@ -14442,7 +14471,7 @@ mod tests {
1444214471
assert_eq!(
1444314472
ast,
1444414473
Err(ParserError::ParserError(
14445-
"Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: a at Line: 1, Column: 16"
14474+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: a at Line: 1, Column: 16"
1444614475
.to_string()
1444714476
))
1444814477
);

tests/sqlparser_common.rs

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4601,7 +4601,7 @@ fn run_explain_analyze(
46014601
expected_verbose: bool,
46024602
expected_analyze: bool,
46034603
expected_format: Option<AnalyzeFormat>,
4604-
exepcted_options: Option<Vec<UtilityOption>>,
4604+
expected_options: Option<Vec<UtilityOption>>,
46054605
) {
46064606
match dialect.verified_stmt(query) {
46074607
Statement::Explain {
@@ -4617,7 +4617,7 @@ fn run_explain_analyze(
46174617
assert_eq!(verbose, expected_verbose);
46184618
assert_eq!(analyze, expected_analyze);
46194619
assert_eq!(format, expected_format);
4620-
assert_eq!(options, exepcted_options);
4620+
assert_eq!(options, expected_options);
46214621
assert!(!query_plan);
46224622
assert!(!estimate);
46234623
assert_eq!("SELECT sqrt(id) FROM foo", statement.to_string());
@@ -9319,6 +9319,46 @@ fn parse_is_boolean() {
93199319
verified_expr(sql)
93209320
);
93219321

9322+
let sql = "a IS NORMALIZED";
9323+
assert_eq!(
9324+
IsNormalized {
9325+
expr: Box::new(Identifier(Ident::new("a"))),
9326+
form: None,
9327+
negated: false,
9328+
},
9329+
verified_expr(sql)
9330+
);
9331+
9332+
let sql = "a IS NOT NORMALIZED";
9333+
assert_eq!(
9334+
IsNormalized {
9335+
expr: Box::new(Identifier(Ident::new("a"))),
9336+
form: None,
9337+
negated: true,
9338+
},
9339+
verified_expr(sql)
9340+
);
9341+
9342+
let sql = "a IS NFKC NORMALIZED";
9343+
assert_eq!(
9344+
IsNormalized {
9345+
expr: Box::new(Identifier(Ident::new("a"))),
9346+
form: Some(NormalizationForm::NFKC),
9347+
negated: false,
9348+
},
9349+
verified_expr(sql)
9350+
);
9351+
9352+
let sql = "a IS NOT NFKD NORMALIZED";
9353+
assert_eq!(
9354+
IsNormalized {
9355+
expr: Box::new(Identifier(Ident::new("a"))),
9356+
form: Some(NormalizationForm::NFKD),
9357+
negated: true,
9358+
},
9359+
verified_expr(sql)
9360+
);
9361+
93229362
let sql = "a IS UNKNOWN";
93239363
assert_eq!(
93249364
IsUnknown(Box::new(Identifier(Ident::new("a")))),
@@ -9337,14 +9377,50 @@ fn parse_is_boolean() {
93379377
verified_stmt("SELECT f FROM foo WHERE field IS FALSE");
93389378
verified_stmt("SELECT f FROM foo WHERE field IS NOT FALSE");
93399379

9380+
verified_stmt("SELECT f FROM foo WHERE field IS NORMALIZED");
9381+
verified_stmt("SELECT f FROM foo WHERE field IS NFC NORMALIZED");
9382+
verified_stmt("SELECT f FROM foo WHERE field IS NFD NORMALIZED");
9383+
verified_stmt("SELECT f FROM foo WHERE field IS NOT NORMALIZED");
9384+
verified_stmt("SELECT f FROM foo WHERE field IS NOT NFKC NORMALIZED");
9385+
93409386
verified_stmt("SELECT f FROM foo WHERE field IS UNKNOWN");
93419387
verified_stmt("SELECT f FROM foo WHERE field IS NOT UNKNOWN");
93429388

93439389
let sql = "SELECT f from foo where field is 0";
93449390
let res = parse_sql_statements(sql);
93459391
assert_eq!(
93469392
ParserError::ParserError(
9347-
"Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: 0"
9393+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: 0"
9394+
.to_string()
9395+
),
9396+
res.unwrap_err()
9397+
);
9398+
9399+
let sql = "SELECT s, s IS XYZ NORMALIZED FROM foo";
9400+
let res = parse_sql_statements(sql);
9401+
assert_eq!(
9402+
ParserError::ParserError(
9403+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: XYZ"
9404+
.to_string()
9405+
),
9406+
res.unwrap_err()
9407+
);
9408+
9409+
let sql = "SELECT s, s IS NFKC FROM foo";
9410+
let res = parse_sql_statements(sql);
9411+
assert_eq!(
9412+
ParserError::ParserError(
9413+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: FROM"
9414+
.to_string()
9415+
),
9416+
res.unwrap_err()
9417+
);
9418+
9419+
let sql = "SELECT s, s IS TRIM(' NFKC ') FROM foo";
9420+
let res = parse_sql_statements(sql);
9421+
assert_eq!(
9422+
ParserError::ParserError(
9423+
"Expected: [NOT] NULL | TRUE | FALSE | DISTINCT | [form] NORMALIZED FROM after IS, found: TRIM"
93489424
.to_string()
93499425
),
93509426
res.unwrap_err()
@@ -13006,7 +13082,7 @@ fn test_trailing_commas_in_from() {
1300613082
let sql = "SELECT a FROM b, WHERE c = 1";
1300713083
let _ = dialects.parse_sql_statements(sql).unwrap();
1300813084

13009-
// nasted
13085+
// nested
1301013086
let sql = "SELECT 1, 2 FROM (SELECT * FROM t,),";
1301113087
let _ = dialects.parse_sql_statements(sql).unwrap();
1301213088

tests/sqlparser_mysql.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2572,7 +2572,7 @@ fn parse_kill() {
25722572
}
25732573

25742574
#[test]
2575-
fn parse_table_colum_option_on_update() {
2575+
fn parse_table_column_option_on_update() {
25762576
let sql1 = "CREATE TABLE foo (`modification_time` DATETIME ON UPDATE CURRENT_TIMESTAMP())";
25772577
match mysql().verified_stmt(sql1) {
25782578
Statement::CreateTable(CreateTable { name, columns, .. }) => {

0 commit comments

Comments
 (0)