@@ -35,7 +35,9 @@ use serde::{Deserialize, Serialize};
 use sqlparser_derive::{Visit, VisitMut};
 
 use crate::ast::DollarQuotedString;
-use crate::dialect::{BigQueryDialect, DuckDbDialect, GenericDialect, SnowflakeDialect};
+use crate::dialect::{
+    BigQueryDialect, DuckDbDialect, GenericDialect, HiveDialect, SnowflakeDialect,
+};
 use crate::dialect::{Dialect, MySqlDialect};
 use crate::keywords::{Keyword, ALL_KEYWORDS, ALL_KEYWORDS_INDEX};
 
@@ -495,9 +497,32 @@ impl<'a> Tokenizer<'a> {
         Ok(tokens)
     }
 
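+    /// Tokenize an identifier or keyword, given the characters already seen in `ch`
+    /// (a plain identifier-start char, or a prefix such as `%`, `#`, `@`, or `@@`
+    /// plus the char after it); the last char in `ch` is still only peeked and is
+    /// consumed here.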
+    fn tokenize_identifier_or_keyword(
+        &self,
+        ch: String,
+        chars: &mut State,
+    ) -> Result<Option<Token>, TokenizerError> {
+        chars.next(); // consume the first char
+        let word = self.tokenize_word(ch, chars);
+
+        // TODO: implement parsing of exponent here
+        if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
+            let mut inner_state = State {
+                peekable: word.chars().peekable(),
+                line: 0,
+                col: 0,
+            };
+            let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
+            let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
+            s += s2.as_str();
+            return Ok(Some(Token::Number(s, false)));
+        }
+
+        Ok(Some(Token::make_word(&word, None)))
+    }
+
     /// Get the next token or return None
     fn next_token(&self, chars: &mut State) -> Result<Option<Token>, TokenizerError> {
-        //println!("next_token: {:?}", chars.peek());
         match chars.peek() {
             Some(&ch) => match ch {
                 ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
@@ -525,7 +550,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "b" or "B"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -544,7 +569,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "r" or "R"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -560,7 +585,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "N"
-                            let s = self.tokenize_word(n, chars);
+                            let s = self.tokenize_word(n.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -577,7 +602,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "E" or "e"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -594,33 +619,11 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "X"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.to_string(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
                 }
-                // identifier or keyword
-                ch if self.dialect.is_identifier_start(ch) => {
-                    chars.next(); // consume the first char
-                    let word = self.tokenize_word(ch, chars);
-
-                    // TODO: implement parsing of exponent here
-                    if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
-                        let mut inner_state = State {
-                            peekable: word.chars().peekable(),
-                            line: 0,
-                            col: 0,
-                        };
-                        let mut s = peeking_take_while(&mut inner_state, |ch| {
-                            matches!(ch, '0'..='9' | '.')
-                        });
-                        let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
-                        s += s2.as_str();
-                        return Ok(Some(Token::Number(s, false)));
-                    }
-
-                    Ok(Some(Token::make_word(&word, None)))
-                }
                 // single quoted string
                 '\'' => {
                     let s = self.tokenize_quoted_string(chars, '\'')?;
@@ -714,7 +717,7 @@ impl<'a> Tokenizer<'a> {
 
                     // mysql dialect supports identifiers that start with a numeric prefix,
                     // as long as they aren't an exponent number.
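+                    // (Hive accepts the same numeric-prefixed identifiers, hence the
+                    // HiveDialect check added below.)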
-                    if dialect_of!(self is MySqlDialect) && exponent_part.is_empty() {
+                    if dialect_of!(self is MySqlDialect | HiveDialect) && exponent_part.is_empty() {
                         let word =
                             peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
@@ -786,7 +789,18 @@ impl<'a> Tokenizer<'a> {
                 }
                 '+' => self.consume_and_return(chars, Token::Plus),
                 '*' => self.consume_and_return(chars, Token::Mul),
-                '%' => self.consume_and_return(chars, Token::Mod),
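+                // `%` normally tokenizes as the modulo operator; when the dialect
+                // reports `%` as an identifier start and it is immediately followed by
+                // something other than a space, treat it as the start of an identifier.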
+                '%' => {
+                    chars.next();
+                    match chars.peek() {
+                        Some(' ') => Ok(Some(Token::Mod)),
+                        Some(sch) if self.dialect.is_identifier_start('%') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
+                        _ => Ok(Some(Token::Mod)),
+                    }
+                }
                 '|' => {
                     chars.next(); // consume the '|'
                     match chars.peek() {
@@ -901,6 +915,12 @@ impl<'a> Tokenizer<'a> {
                                 _ => Ok(Some(Token::HashArrow)),
                             }
                         }
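+                        // a `#` followed by a space stays the Sharp token; dialects that
+                        // report `#` as an identifier start instead get `#name`-style
+                        // words as a single identifier token.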
+                        Some(' ') => Ok(Some(Token::Sharp)),
+                        Some(sch) if self.dialect.is_identifier_start('#') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
                         _ => Ok(Some(Token::Sharp)),
                     }
                 }
@@ -909,7 +929,25 @@ impl<'a> Tokenizer<'a> {
                     match chars.peek() {
                         Some('>') => self.consume_and_return(chars, Token::AtArrow),
                         Some('?') => self.consume_and_return(chars, Token::AtQuestion),
-                        Some('@') => self.consume_and_return(chars, Token::AtAt),
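+                        // `@@` (and a lone `@`) followed by a space stay operator tokens;
+                        // dialects that report `@` as an identifier start instead get
+                        // `@name` / `@@name` as a single identifier token.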
+                        Some('@') => {
+                            chars.next();
+                            match chars.peek() {
+                                Some(' ') => Ok(Some(Token::AtAt)),
+                                Some(tch) if self.dialect.is_identifier_start('@') => {
+                                    let mut s = ch.to_string();
+                                    s.push('@');
+                                    s.push_str(&tch.to_string());
+                                    self.tokenize_identifier_or_keyword(s, chars)
+                                }
+                                _ => Ok(Some(Token::AtAt)),
+                            }
+                        }
+                        Some(' ') => Ok(Some(Token::AtSign)),
+                        Some(sch) if self.dialect.is_identifier_start('@') => {
+                            let mut s = ch.to_string();
+                            s.push_str(&sch.to_string());
+                            self.tokenize_identifier_or_keyword(s, chars)
+                        }
                         _ => Ok(Some(Token::AtSign)),
                     }
                 }
@@ -918,6 +956,11 @@ impl<'a> Tokenizer<'a> {
                     let s = peeking_take_while(chars, |ch| ch.is_numeric());
                     Ok(Some(Token::Placeholder(String::from("?") + &s)))
                 }
+
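+                // note: this arm now comes after the `%`, `#`, and `@` arms above, so
+                // those characters only reach identifier tokenization through the
+                // explicit lookahead added there.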
+                // identifier or keyword
+                ch if self.dialect.is_identifier_start(ch) => {
+                    self.tokenize_identifier_or_keyword(ch.to_string(), chars)
+                }
                 '$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
 
                 //whitespace check (including unicode chars) should be last as it covers some of the chars above
@@ -1043,8 +1086,8 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_char: char, chars: &mut State) -> String {
-        let mut s = first_char.to_string();
+    fn tokenize_word(&self, first_chars: String, chars: &mut State) -> String {
+        let mut s = first_chars;
         s.push_str(&peeking_take_while(chars, |ch| {
             self.dialect.is_identifier_part(ch)
         }));
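
For a sense of what these tokenizer changes allow: a dialect can now report `@`, `#`, or `%` as identifier-start characters and still get the plain operator tokens when those characters stand alone. A minimal sketch exercising the `@`/`#` paths with a hypothetical custom dialect (`AtHashDialect` below is illustrative, not part of this change):

use sqlparser::dialect::Dialect;
use sqlparser::tokenizer::Tokenizer;

// Hypothetical dialect that treats '@' and '#' as identifier-start characters.
#[derive(Debug)]
struct AtHashDialect;

impl Dialect for AtHashDialect {
    fn is_identifier_start(&self, ch: char) -> bool {
        ch.is_ascii_alphabetic() || ch == '_' || ch == '@' || ch == '#'
    }
    fn is_identifier_part(&self, ch: char) -> bool {
        ch.is_ascii_alphanumeric() || ch == '_'
    }
}

fn main() {
    let dialect = AtHashDialect;
    // "@var" and "#tmp" should come back as single Word tokens, while the
    // standalone "@@" (followed by a space) still tokenizes as Token::AtAt.
    let mut tokenizer = Tokenizer::new(&dialect, "SELECT @var, #tmp FROM t WHERE a @@ b");
    let tokens = tokenizer.tokenize().unwrap();
    println!("{:?}", tokens);
}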