From 0b62d19e48b14046d18c3796e6cd3c253ba53bb4 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 11 Jun 2024 11:59:06 +0200 Subject: [PATCH] parser: add `\u{}` escape sequence for Unicode characters --- CHANGELOG.md | 1 + src/lib.rs | 5 +++-- src/parser.rs | 14 +++++++++++++- tests/errors/string_escape_unicode_char.error | 3 +++ tests/errors/string_escape_unicode_char.in | 3 +++ tests/errors/string_escape_unicode_empty.error | 3 +++ tests/errors/string_escape_unicode_empty.in | 3 +++ tests/errors/string_escape_unicode_long.error | 3 +++ tests/errors/string_escape_unicode_long.in | 3 +++ tests/scripts/strings | 10 +++++----- 10 files changed, 40 insertions(+), 8 deletions(-) create mode 100644 tests/errors/string_escape_unicode_char.error create mode 100644 tests/errors/string_escape_unicode_char.in create mode 100644 tests/errors/string_escape_unicode_empty.error create mode 100644 tests/errors/string_escape_unicode_empty.in create mode 100644 tests/errors/string_escape_unicode_long.error create mode 100644 tests/errors/string_escape_unicode_long.in diff --git a/CHANGELOG.md b/CHANGELOG.md index 7728f21..bc0c9fb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ **Improvements** * Add `\x` escape sequence for hex bytes. +* Add `\u{}` escape sequence for Unicode characters. * Allow empty commands, keys, and prefixes. # 0.5.0 (2024-05-31) diff --git a/src/lib.rs b/src/lib.rs index aee901c..bd2fd99 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -295,8 +295,9 @@ //! Strings can be quoted using `"` or `'`, in which case they can contain //! arbitrary Unicode characters. `\` is used as an escape character, both to //! escape quotes `\"` and `\'` as well as itself `\\`, and also `\0` (null), -//! `\n` (newline), `\r` (carriage return), and `\t` (tab). `\x` can be used -//! to represent arbitrary hexadecimal characters, e.g. `\x7a`. +//! `\n` (newline), `\r` (carriage return), and `\t` (tab). `\x` can be used to +//! represent arbitrary hexadecimal bytes (e.g. 
`\x7a`) and `\u{}` can be used +//! to represent arbitrary Unicode characters (e.g. `\u{1f44b}`). //! //! ```text //! string diff --git a/src/parser.rs b/src/parser.rs index fd07b40..e148ba0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,7 +1,7 @@ use crate::command::{Argument, Block, Command}; use nom::branch::alt; -use nom::bytes::complete::{escaped_transform, is_not, tag, take}; +use nom::bytes::complete::{escaped_transform, is_not, tag, take, take_while_m_n}; use nom::character::complete::{ alphanumeric1, anychar, char, line_ending, not_line_ending, space0, space1, }; @@ -201,6 +201,18 @@ fn quoted_string(quote: char) -> impl FnMut(Span) -> IResult { Err(_) => Err(Error::new(input, ErrorKind::HexDigit)), }, ), + map_res( + delimited( + tag("u{"), + take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit()), + tag("}"), + ), + |input: Span| { + let codepoint = u32::from_str_radix(input.fragment(), 16) + .or(Err(Error::new(input, ErrorKind::HexDigit)))?; + char::from_u32(codepoint).ok_or(Error::new(input, ErrorKind::Char)) + }, + ), )), ), tag(q), diff --git a/tests/errors/string_escape_unicode_char.error b/tests/errors/string_escape_unicode_char.error new file mode 100644 index 0000000..78ad6ed --- /dev/null +++ b/tests/errors/string_escape_unicode_char.error @@ -0,0 +1,3 @@ +parse error at line 2 column 1 for Tag: +'\u{fg}' +^ \ No newline at end of file diff --git a/tests/errors/string_escape_unicode_char.in b/tests/errors/string_escape_unicode_char.in new file mode 100644 index 0000000..5933a56 --- /dev/null +++ b/tests/errors/string_escape_unicode_char.in @@ -0,0 +1,3 @@ +# A Unicode escape with a non-hex character errors. 
+'\u{fg}' +--- \ No newline at end of file diff --git a/tests/errors/string_escape_unicode_empty.error b/tests/errors/string_escape_unicode_empty.error new file mode 100644 index 0000000..ba91662 --- /dev/null +++ b/tests/errors/string_escape_unicode_empty.error @@ -0,0 +1,3 @@ +parse error at line 2 column 1 for Tag: +'\u{}' +^ \ No newline at end of file diff --git a/tests/errors/string_escape_unicode_empty.in b/tests/errors/string_escape_unicode_empty.in new file mode 100644 index 0000000..cfbe9d2 --- /dev/null +++ b/tests/errors/string_escape_unicode_empty.in @@ -0,0 +1,3 @@ +# A Unicode escape with no digits errors. +'\u{}' +--- \ No newline at end of file diff --git a/tests/errors/string_escape_unicode_long.error b/tests/errors/string_escape_unicode_long.error new file mode 100644 index 0000000..8280197 --- /dev/null +++ b/tests/errors/string_escape_unicode_long.error @@ -0,0 +1,3 @@ +parse error at line 2 column 1 for Tag: +'\u{1234567}' +^ \ No newline at end of file diff --git a/tests/errors/string_escape_unicode_long.in b/tests/errors/string_escape_unicode_long.in new file mode 100644 index 0000000..57267e4 --- /dev/null +++ b/tests/errors/string_escape_unicode_long.in @@ -0,0 +1,3 @@ +# A Unicode escape with 7 digits errors. +'\u{1234567}' +--- \ No newline at end of file diff --git a/tests/scripts/strings b/tests/scripts/strings index 3b46beb..3b607cb 100644 --- a/tests/scripts/strings +++ b/tests/scripts/strings @@ -40,13 +40,13 @@ command foo="" bar='' --- Command { name: "command", args: [Argument { key: Some("foo"), value: "" }, Argument { key: Some("bar"), value: "" }], prefix: None, silent: false, fail: false, line_number: 39 } -# Escape sequences are respected both in single- and double-quotes, including +# Escape sequences are respected both in single and double quotes, including # both quote types. 
-'\\ \' \" \0 \n \r \t \\ \x00 \x7A \xff' -"\\ \' \" \0 \n \r \t \\ \x00 \x7A \xff" +'\\ \' \" \0 \n \r \t \\ \x00 \x7A \xff \u{1F44b}' +"\\ \' \" \0 \n \r \t \\ \x00 \x7A \xff \u{1F44b}" --- -Command { name: "\\ ' \" \0 \n \r \t \\ \0 z ÿ", args: [], prefix: None, silent: false, fail: false, line_number: 45 } -Command { name: "\\ ' \" \0 \n \r \t \\ \0 z ÿ", args: [], prefix: None, silent: false, fail: false, line_number: 46 } +Command { name: "\\ ' \" \0 \n \r \t \\ \0 z ÿ 👋", args: [], prefix: None, silent: false, fail: false, line_number: 45 } +Command { name: "\\ ' \" \0 \n \r \t \\ \0 z ÿ 👋", args: [], prefix: None, silent: false, fail: false, line_number: 46 } # Quoted strings can contain the other, unescaped quote kind. '"'