Skip to content

Commit

Permalink
parser: add \u{} escape sequence for Unicode characters
Browse files Browse the repository at this point in the history
  • Loading branch information
erikgrinaker committed Jun 11, 2024
1 parent 686e261 commit 0b62d19
Show file tree
Hide file tree
Showing 10 changed files with 40 additions and 8 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
**Improvements**

* Add `\x` escape sequence for hex bytes.
* Add `\u{}` escape sequence for Unicode characters.
* Allow empty commands, keys, and prefixes.

# 0.5.0 (2024-05-31)
Expand Down
5 changes: 3 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -295,8 +295,9 @@
//! Strings can be quoted using `"` or `'`, in which case they can contain
//! arbitrary Unicode characters. `\` is used as an escape character, to escape
//! quotes (`\"` and `\'`) and itself (`\\`), and to write `\0` (null),
//! `\n` (newline), `\r` (carriage return), and `\t` (tab). `\x` can be used
//! to represent arbitrary hexadecimal characters, e.g. `\x7a`.
//! `\n` (newline), `\r` (carriage return), and `\t` (tab). `\x` can be used to
//! represent arbitrary hexadecimal bytes (e.g. `\x7a`) and `\u{}` can be used
//! to represent arbitrary Unicode characters (e.g. `\u{1f44b}`).
//!
//! ```text
//! string
Expand Down
14 changes: 13 additions & 1 deletion src/parser.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::command::{Argument, Block, Command};

use nom::branch::alt;
use nom::bytes::complete::{escaped_transform, is_not, tag, take};
use nom::bytes::complete::{escaped_transform, is_not, tag, take, take_while_m_n};
use nom::character::complete::{
alphanumeric1, anychar, char, line_ending, not_line_ending, space0, space1,
};
Expand Down Expand Up @@ -201,6 +201,18 @@ fn quoted_string(quote: char) -> impl FnMut(Span) -> IResult<String> {
Err(_) => Err(Error::new(input, ErrorKind::HexDigit)),
},
),
map_res(
delimited(
tag("u{"),
take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit()),
tag("}"),
),
|input: Span| {
let codepoint = u32::from_str_radix(input.fragment(), 16)
.or(Err(Error::new(input, ErrorKind::HexDigit)))?;
char::from_u32(codepoint).ok_or(Error::new(input, ErrorKind::Char))
},
),
)),
),
tag(q),
Expand Down
3 changes: 3 additions & 0 deletions tests/errors/string_escape_unicode_char.error
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
parse error at line 2 column 1 for Tag:
'\u{fg}'
^
3 changes: 3 additions & 0 deletions tests/errors/string_escape_unicode_char.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# A Unicode escape with a non-hex character errors.
'\u{fg}'
---
3 changes: 3 additions & 0 deletions tests/errors/string_escape_unicode_empty.error
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
parse error at line 2 column 1 for Tag:
'\u{}'
^
3 changes: 3 additions & 0 deletions tests/errors/string_escape_unicode_empty.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# A Unicode escape with no digits errors.
'\u{}'
---
3 changes: 3 additions & 0 deletions tests/errors/string_escape_unicode_long.error
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
parse error at line 2 column 1 for Tag:
'\u{1234567}'
^
3 changes: 3 additions & 0 deletions tests/errors/string_escape_unicode_long.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# A Unicode escape with 7 digits errors.
'\u{1234567}'
---
10 changes: 5 additions & 5 deletions tests/scripts/strings
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@ command foo="" bar=''
---
Command { name: "command", args: [Argument { key: Some("foo"), value: "" }, Argument { key: Some("bar"), value: "" }], prefix: None, silent: false, fail: false, line_number: 39 }

# Escape sequences are respected both in single- and double-quotes, including
# Escape sequences are respected both in single and double quotes, including
# both quote types.
'\\ \' \" \0 \n \r \t \\ \x00 \x7A \xff'
"\\ \' \" \0 \n \r \t \\ \x00 \x7A \xff"
'\\ \' \" \0 \n \r \t \\ \x00 \x7A \xff \u{1F44b}'
"\\ \' \" \0 \n \r \t \\ \x00 \x7A \xff \u{1F44b}"
---
Command { name: "\\ ' \" \0 \n \r \t \\ \0 z ÿ", args: [], prefix: None, silent: false, fail: false, line_number: 45 }
Command { name: "\\ ' \" \0 \n \r \t \\ \0 z ÿ", args: [], prefix: None, silent: false, fail: false, line_number: 46 }
Command { name: "\\ ' \" \0 \n \r \t \\ \0 z ÿ 👋", args: [], prefix: None, silent: false, fail: false, line_number: 45 }
Command { name: "\\ ' \" \0 \n \r \t \\ \0 z ÿ 👋", args: [], prefix: None, silent: false, fail: false, line_number: 46 }

# Quoted strings can contain the other, unescaped quote kind.
'"'
Expand Down

0 comments on commit 0b62d19

Please sign in to comment.