11//! A parser of the EBNF-like grammar.
22
33use super :: { Characters , Expression , ExpressionKind , Grammar , Production } ;
4- use regex:: { Captures , Regex } ;
54use std:: fmt;
65use std:: fmt:: Display ;
76use std:: path:: Path ;
8- use std:: sync:: LazyLock ;
97
108struct Parser < ' a > {
119 input : & ' a str ,
@@ -76,18 +74,6 @@ impl Parser<'_> {
7674 & self . input [ i..i + upper]
7775 }
7876
79- /// If the input matches the given regex, it is returned and the head is moved forward.
80- ///
81- /// Note that regexes must start with `^`.
82- fn take_re ( & mut self , re : & Regex ) -> Option < Captures < ' _ > > {
83- if let Some ( cap) = re. captures ( & self . input [ self . index ..] ) {
84- self . index += cap[ 0 ] . len ( ) ;
85- Some ( cap)
86- } else {
87- None
88- }
89- }
90-
9177 /// Returns whether or not the given string is next, and advances the head if it is.
9278 fn take_str ( & mut self , s : & str ) -> bool {
9379 if self . input [ self . index ..] . starts_with ( s) {
@@ -168,13 +154,12 @@ impl Parser<'_> {
168154 }
169155
170156 fn parse_expression ( & mut self ) -> Result < Option < Expression > > {
171- static ALT_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^ *\| *" ) . unwrap ( ) ) ;
172-
173157 let mut es = Vec :: new ( ) ;
174158 loop {
175159 let Some ( e) = self . parse_seq ( ) ? else { break } ;
176160 es. push ( e) ;
177- if self . take_re ( & ALT_RE ) . is_none ( ) {
161+ _ = self . space0 ( ) ;
162+ if !self . take_str ( "|" ) {
178163 break ;
179164 }
180165 }
@@ -268,21 +253,28 @@ impl Parser<'_> {
268253 Some ( ExpressionKind :: Nt ( nt) )
269254 }
270255
256+ /// Parse terminal within backticks.
271257 fn parse_terminal ( & mut self ) -> Result < ExpressionKind > {
272- static TERMINAL_RE : LazyLock < Regex > =
273- LazyLock :: new ( || Regex :: new ( r"^`([^`\n]+)`" ) . unwrap ( ) ) ;
274- match self . take_re ( & TERMINAL_RE ) {
275- Some ( cap) => Ok ( ExpressionKind :: Terminal ( cap[ 1 ] . to_string ( ) ) ) ,
276- None => bail ! ( self , "unterminated terminal, expected closing backtick" ) ,
258+ Ok ( ExpressionKind :: Terminal ( self . parse_terminal_str ( ) ?) )
259+ }
260+
261+ /// Parse string within backticks.
262+ fn parse_terminal_str ( & mut self ) -> Result < String > {
263+ self . expect ( "`" , "expected opening backtick" ) ?;
264+ let term = self . take_while ( & |x| ![ '\n' , '`' ] . contains ( & x) ) . to_string ( ) ;
265+ if term. is_empty ( ) {
266+ bail ! ( self , "expected terminal" ) ;
277267 }
268+ self . expect ( "`" , "expected closing backtick" ) ?;
269+ Ok ( term)
278270 }
279271
280272 fn parse_charset ( & mut self ) -> Result < ExpressionKind > {
281273 self . expect ( "[" , "expected opening [" ) ?;
282274 let mut characters = Vec :: new ( ) ;
283275 loop {
284276 self . space0 ( ) ;
285- let Some ( ch) = self . parse_characters ( ) else {
277+ let Some ( ch) = self . parse_characters ( ) ? else {
286278 break ;
287279 } ;
288280 characters. push ( ch) ;
@@ -295,27 +287,48 @@ impl Parser<'_> {
295287 Ok ( ExpressionKind :: Charset ( characters) )
296288 }
297289
298- fn parse_characters ( & mut self ) -> Option < Characters > {
299- static RANGE_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^`(.)`-`(.)`" ) . unwrap ( ) ) ;
300- static TERMINAL_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( "^`([^`\n ]+)`" ) . unwrap ( ) ) ;
301- if let Some ( cap) = self . take_re ( & RANGE_RE ) {
302- let a = cap[ 1 ] . chars ( ) . next ( ) . unwrap ( ) ;
303- let b = cap[ 2 ] . chars ( ) . next ( ) . unwrap ( ) ;
304- Some ( Characters :: Range ( a, b) )
305- } else if let Some ( cap) = self . take_re ( & TERMINAL_RE ) {
306- Some ( Characters :: Terminal ( cap[ 1 ] . to_string ( ) ) )
290+ /// Parse an element of a character class, e.g.
291+ /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
292+ fn parse_characters ( & mut self ) -> Result < Option < Characters > > {
293+ if let Some ( b'`' ) = self . peek ( ) {
294+ let recov = self . index ;
295+ let a = self . parse_terminal_str ( ) ?;
296+ if self . take_str ( "-" ) {
297+ //~^ Parse `` `a`-`b` `` character range.
298+ if a. len ( ) > 1 {
299+ self . index = recov + 1 ;
300+ bail ! ( self , "invalid start terminal in range" ) ;
301+ }
302+ let recov = self . index ;
303+ let b = self . parse_terminal_str ( ) ?;
304+ if b. len ( ) > 1 {
305+ self . index = recov + 1 ;
306+ bail ! ( self , "invalid end terminal in range" ) ;
307+ }
308+ let a = a. chars ( ) . next ( ) . unwrap ( ) ;
309+ let b = b. chars ( ) . next ( ) . unwrap ( ) ;
310+ Ok ( Some ( Characters :: Range ( a, b) ) )
311+ } else {
312+ //~^ Parse terminal in backticks.
313+ Ok ( Some ( Characters :: Terminal ( a) ) )
314+ }
315+ } else if let Some ( name) = self . parse_name ( ) {
316+ //~^ Parse nonterminal identifier.
317+ Ok ( Some ( Characters :: Named ( name) ) )
307318 } else {
308- let name = self . parse_name ( ) ?;
309- Some ( Characters :: Named ( name) )
319+ Ok ( None )
310320 }
311321 }
312322
323+ /// Parse e.g. `<prose text>`.
313324 fn parse_prose ( & mut self ) -> Result < ExpressionKind > {
314- static PROSE_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^<([^>\n]+)>" ) . unwrap ( ) ) ;
315- match self . take_re ( & PROSE_RE ) {
316- Some ( cap ) => Ok ( ExpressionKind :: Prose ( cap [ 1 ] . to_string ( ) ) ) ,
317- None => bail ! ( self , "unterminated prose, expected closing `>`" ) ,
325+ self . expect ( "<" , "expected opening `<`" ) ? ;
326+ let text = self . take_while ( & |x| ! [ '\n' , '>' ] . contains ( & x ) ) . to_string ( ) ;
327+ if text . is_empty ( ) {
328+ bail ! ( self , "expected prose text" ) ;
318329 }
330+ self . expect ( ">" , "expected closing `>`" ) ?;
331+ Ok ( ExpressionKind :: Prose ( text) )
319332 }
320333
321334 fn parse_grouped ( & mut self ) -> Result < ExpressionKind > {
@@ -344,13 +357,19 @@ impl Parser<'_> {
344357 Ok ( ExpressionKind :: NegExpression ( box_kind ( kind) ) )
345358 }
346359
360+ /// Parse e.g. `F00F` after `U+`.
347361 fn parse_unicode ( & mut self ) -> Result < ExpressionKind > {
348- static UNICODE_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^[A-Z0-9]{4}" ) . unwrap ( ) ) ;
349-
350- match self . take_re ( & UNICODE_RE ) {
351- Some ( s) => Ok ( ExpressionKind :: Unicode ( s[ 0 ] . to_string ( ) ) ) ,
352- None => bail ! ( self , "expected 4 hexadecimal uppercase digits after U+" ) ,
362+ let mut xs = Vec :: with_capacity ( 4 ) ;
363+ for _ in 0 ..4 {
364+ match self . peek ( ) {
365+ Some ( x @ ( b'0' ..=b'9' | b'A' ..=b'F' ) ) => {
366+ xs. push ( x) ;
367+ self . index += 1 ;
368+ }
369+ _ => bail ! ( self , "expected 4 uppercase hexidecimal digits after `U+`" ) ,
370+ }
353371 }
372+ Ok ( ExpressionKind :: Unicode ( String :: from_utf8 ( xs) . unwrap ( ) ) )
354373 }
355374
356375 /// Parse `?` after expression.
@@ -428,16 +447,17 @@ impl Parser<'_> {
428447 Ok ( Some ( self . input [ start..self . index - 1 ] . to_string ( ) ) )
429448 }
430449
450+ /// Parse footnote reference, e.g. `[^id]`.
431451 fn parse_footnote ( & mut self ) -> Result < Option < String > > {
432- static FOOTNOTE_RE : LazyLock < Regex > =
433- LazyLock :: new ( || Regex :: new ( r"^([^\]\n]+)]" ) . unwrap ( ) ) ;
434452 if !self . take_str ( "[^" ) {
435453 return Ok ( None ) ;
436454 }
437- match self . take_re ( & FOOTNOTE_RE ) {
438- Some ( cap ) => Ok ( Some ( cap [ 1 ] . to_string ( ) ) ) ,
439- None => bail ! ( self , "unterminated footnote, expected closing `]`" ) ,
455+ let id = self . take_while ( & |x| ! [ '\n' , ']' ] . contains ( & x ) ) . to_string ( ) ;
456+ if id . is_empty ( ) {
457+ bail ! ( self , "expected footnote id" ) ;
440458 }
459+ self . expect ( "]" , "expected closing `]`" ) ?;
460+ Ok ( Some ( id) )
441461 }
442462}
443463
0 commit comments