Skip to content

Commit

Permalink
Implement RFC 3349, mixed utf8 literals.
Browse files Browse the repository at this point in the history
Specifically:
- Allow unicode chars in b"" and br"" literals. This is done by changing
  `Mode::allow_unicode_chars` to succeed on `ByteStr` and `RawByteStr`.
- Allow unicode escapes in b"" literals. This is done by changing
  `Mode::allow_unicode_escapes` to succeed on `ByteStr`.

Byte string literals can already have high bytes (`\x80`..`\xff`).
Because they now also support unicode chars, they can now be mixed utf8,
so we use `unescape_mixed`/`cook_mixed` instead of
`unescape_unicode`/`cook_unicode` to process them.

A new type, `Rfc3349`, is used to implement the feature gating. Values of
that type are threaded through the unescaping code to track whether
rules from rfc3349 are required for unescaping to succeed.

Test changes

XXX: not sure about the latter three; could just move them into
accepting tests

- tests/ui/attributes/key-value-non-ascii.rs: changed from a byte string
  literal to a byte literal; we just need some kind of problem with a
  literal to preserve the test's intent.

- tests/ui/parser/raw/raw-byte-string-literals.rs: removed the raw byte
  string literal with a non-ASCII char. The other lexing errors meant
  that the feature gate warning wasn't occurring anyway, because
  compilation was aborting too early. No great loss, because we'll test
  far more complex cases in `tests/ui/mixed-utf8-literals/`.

- tests/ui/parser/byte-string-literals.rs: similar.

- tests/ui/parser/issues/issue-23620-invalid-escapes.rs: left the test
  unchanged; two old `unicode escape in byte string` errors are now
  `mixed utf8 b"" and br"" literals are experimental` errors.

- tests/ui/parser/unicode-control-codepoints.rs: similar.

- tests/ui/suggestions/multibyte-escapes.rs: similar.

- compiler/rustc_lexer/src/unescape/tests.rs: two cases that previously
  failed now succeed.

XXX: not sure how to handle rust-analyzer, just allowed mixed utf8
literals everywhere without complaint
  • Loading branch information
nnethercote committed Jan 24, 2024
1 parent 6077f82 commit 8b2b3a7
Show file tree
Hide file tree
Showing 29 changed files with 314 additions and 230 deletions.
16 changes: 10 additions & 6 deletions compiler/rustc_ast/src/util/literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
use crate::token::{self, Token};
use rustc_lexer::unescape::{
byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
};
use rustc_span::symbol::{kw, sym, Symbol};
use rustc_span::Span;
Expand Down Expand Up @@ -49,7 +49,8 @@ impl LitKind {

// For byte/char/string literals, chars and escapes have already been
// checked in the lexer (in `cook_lexer_literal`). So we can assume all
// chars and escapes are valid here.
// chars and escapes are valid here, and ignore `Rfc3349` return
// values.
Ok(match kind {
token::Bool => {
assert!(symbol.is_bool_lit());
Expand Down Expand Up @@ -84,7 +85,7 @@ impl LitKind {
// Force-inlining here is aggressive but the closure is
// called on every char in the string, so it can be hot in
// programs with many long strings containing escapes.
unescape_unicode(
_ = unescape_unicode(
s,
Mode::Str,
&mut #[inline(always)]
Expand All @@ -108,8 +109,11 @@ impl LitKind {
token::ByteStr => {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c {
Ok(c) => buf.push(byte_from_char(c)),
_ = unescape_mixed(s, Mode::ByteStr, &mut |_, c| match c {
Ok(MixedUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Ok(MixedUnit::HighByte(b)) => buf.push(b),
Err(err) => {
assert!(!err.is_fatal(), "failed to unescape string literal")
}
Expand All @@ -125,7 +129,7 @@ impl LitKind {
token::CStr => {
let s = symbol.as_str();
let mut buf = Vec::with_capacity(s.len());
unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
_ = unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
Ok(MixedUnit::Char(c)) => {
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
}
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_ast_passes/src/feature_gate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,7 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) {
}
};
}
gate_all!(mixed_utf8_literals, r#"mixed utf8 b"" and br"" literals are experimental"#);
gate_all!(
if_let_guard,
"`if let` guards are experimental",
Expand Down
2 changes: 2 additions & 0 deletions compiler/rustc_feature/src/unstable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,8 @@ declare_features! (
/// standard library until the soundness issues with specialization
/// are fixed.
(unstable, min_specialization, "1.7.0", Some(31844)),
/// Allows mixed utf8 b"" and br"" literals.
(unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)),
/// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns.
(unstable, more_qualified_paths, "1.54.0", Some(86935)),
/// Allows the `#[must_not_suspend]` attribute.
Expand Down
93 changes: 68 additions & 25 deletions compiler/rustc_lexer/src/unescape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ use Mode::*;
#[cfg(test)]
mod tests;

// njn: need to add tests in tests/ui/mixed-utf8-literals/; see
// tests/ui/try-block/ for an example to follow

/// Errors and warnings that can occur during string unescaping. They mostly
/// relate to malformed escape sequences, but there are a few that are about
/// other problems.
Expand Down Expand Up @@ -85,7 +88,7 @@ impl EscapeError {
///
/// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
/// the callback will be called exactly once.
pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
Expand All @@ -94,16 +97,17 @@ where
let mut chars = src.chars();
let res = unescape_char_or_byte(&mut chars, mode);
callback(0..(src.len() - chars.as_str().len()), res);
Rfc3349::Unused // rfc3349 is not relevant for char or byte literals
}
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
Str => unescape_non_raw_common(src, mode, callback),
RawStr | RawByteStr => check_raw_common(src, mode, callback),
RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
if let Ok('\0') = result {
result = Err(EscapeError::NulInCStr);
}
callback(r, result)
}),
CStr => unreachable!(),
ByteStr | CStr => unreachable!(),
}
}

Expand Down Expand Up @@ -142,18 +146,19 @@ impl From<u8> for MixedUnit {
/// a sequence of escaped characters or errors.
///
/// Values are returned by invoking `callback`.
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
where
F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
{
match mode {
ByteStr => unescape_non_raw_common(src, mode, callback),
CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
if let Ok(MixedUnit::Char('\0')) = result {
result = Err(EscapeError::NulInCStr);
}
callback(r, result)
}),
Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable!(),
}
}

Expand All @@ -169,6 +174,15 @@ pub fn unescape_byte(src: &str) -> Result<u8, EscapeError> {
unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char)
}

/// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the
/// literal to be valid. Once rfc3349 is stabilized this type can be removed.
///
/// `unescape_unicode` and `unescape_mixed` return a value of this type, and
/// the unescaping helpers thread a `&mut Rfc3349` through to `Mode`'s
/// `allow_unicode_chars` and `allow_unicode_escapes`, which are the places
/// that flip it to `Used`. The type is `#[must_use]`, so callers that do not
/// care about feature gating must discard the result explicitly.
#[derive(Debug, PartialEq)]
#[must_use]
pub enum Rfc3349 {
/// The literal relied on rfc3349: a non-ASCII char was accepted in a
/// `ByteStr` or `RawByteStr` literal, or a `\u` escape was accepted in a
/// `ByteStr` literal.
Used,
/// The literal did not rely on rfc3349. This is the starting value for
/// every unescaping pass, and is always the result for char and byte
/// literals.
Unused,
}

/// What kind of literal do we parse.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum Mode {
Expand Down Expand Up @@ -205,17 +219,25 @@ impl Mode {

/// Are unicode (non-ASCII) chars allowed?
#[inline]
fn allow_unicode_chars(self) -> bool {
fn allow_unicode_chars(self, rfc3349: &mut Rfc3349) -> bool {
match self {
Byte | ByteStr | RawByteStr => false,
Byte => false,
ByteStr | RawByteStr => {
*rfc3349 = Rfc3349::Used;
true
}
Char | Str | RawStr | CStr | RawCStr => true,
}
}

/// Are unicode escapes (`\u`) allowed?
fn allow_unicode_escapes(self) -> bool {
fn allow_unicode_escapes(self, rfc3349: &mut Rfc3349) -> bool {
match self {
Byte | ByteStr => false,
Byte => false,
ByteStr => {
*rfc3349 = Rfc3349::Used;
true
}
Char | Str | CStr => true,
RawByteStr | RawStr | RawCStr => unreachable!(),
}
Expand All @@ -233,6 +255,7 @@ impl Mode {
fn scan_escape<T: From<char> + From<u8>>(
chars: &mut Chars<'_>,
mode: Mode,
rfc3349: &mut Rfc3349,
) -> Result<T, EscapeError> {
// Previous character was '\\', unescape what follows.
let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
Expand Down Expand Up @@ -262,13 +285,17 @@ fn scan_escape<T: From<char> + From<u8>>(
Ok(T::from(value as u8))
};
}
'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
'u' => return scan_unicode(chars, mode, rfc3349).map(T::from),
_ => return Err(EscapeError::InvalidEscape),
};
Ok(T::from(res))
}

fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
fn scan_unicode(
chars: &mut Chars<'_>,
mode: Mode,
rfc3349: &mut Rfc3349,
) -> Result<char, EscapeError> {
// We've parsed '\u', now we have to parse '{..}'.

if chars.next() != Some('{') {
Expand Down Expand Up @@ -296,7 +323,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch

// Incorrect syntax has higher priority for error reporting
// than unallowed value for a literal.
if !allow_unicode_escapes {
if !mode.allow_unicode_escapes(rfc3349) {
return Err(EscapeError::UnicodeEscapeInByte);
}

Expand All @@ -322,18 +349,28 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<ch
}

#[inline]
fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) }
fn ascii_check(c: char, mode: Mode, rfc3349: &mut Rfc3349) -> Result<char, EscapeError> {
// We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily.
if c.is_ascii() || mode.allow_unicode_chars(rfc3349) {
Ok(c)
} else {
Err(EscapeError::NonAsciiCharInByte)
}
}

fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
let c = chars.next().ok_or(EscapeError::ZeroChars)?;
let mut rfc3349 = Rfc3349::Unused;
let res = match c {
'\\' => scan_escape(chars, mode),
'\\' => scan_escape(chars, mode, &mut rfc3349),
'\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(c, mode.allow_unicode_chars()),
_ => ascii_check(c, mode, &mut rfc3349),
}?;

// rfc3349 is not relevant for char or byte literals.
assert_eq!(rfc3349, Rfc3349::Unused);

if chars.next().is_some() {
return Err(EscapeError::MoreThanOneChar);
}
Expand All @@ -342,12 +379,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca

/// Takes a contents of a string literal (without quotes) and produces a
/// sequence of escaped characters or errors.
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
fn unescape_non_raw_common<F, T: From<char> + From<u8>>(
src: &str,
mode: Mode,
callback: &mut F,
) -> Rfc3349
where
F: FnMut(Range<usize>, Result<T, EscapeError>),
{
let mut chars = src.chars();
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
let mut rfc3349 = Rfc3349::Unused;

// The `start` and `end` computation here is complicated because
// `skip_ascii_whitespace` makes us skip over chars without counting
Expand All @@ -367,16 +408,17 @@ where
});
continue;
}
_ => scan_escape::<T>(&mut chars, mode),
_ => scan_escape::<T>(&mut chars, mode, &mut rfc3349),
}
}
'"' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(EscapeError::BareCarriageReturn),
_ => ascii_check(c, allow_unicode_chars).map(T::from),
_ => ascii_check(c, mode, &mut rfc3349).map(T::from),
};
let end = src.len() - chars.as_str().len();
callback(start..end, res);
}
rfc3349
}

fn skip_ascii_whitespace<F>(chars: &mut Chars<'_>, start: usize, callback: &mut F)
Expand Down Expand Up @@ -409,12 +451,12 @@ where
/// sequence of characters or errors.
/// NOTE: Raw strings do not perform any explicit character escaping, here we
/// only produce errors on bare CR.
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F) -> Rfc3349
where
F: FnMut(Range<usize>, Result<char, EscapeError>),
{
let mut chars = src.chars();
let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
let mut rfc3349 = Rfc3349::Unused;

// The `start` and `end` computation here matches the one in
// `unescape_non_raw_common` for consistency, even though this function
Expand All @@ -423,16 +465,17 @@ where
let start = src.len() - chars.as_str().len() - c.len_utf8();
let res = match c {
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
_ => ascii_check(c, allow_unicode_chars),
_ => ascii_check(c, mode, &mut rfc3349),
};
let end = src.len() - chars.as_str().len();
callback(start..end, res);
}
rfc3349
}

#[inline]
pub fn byte_from_char(c: char) -> u8 {
pub(crate) fn byte_from_char(c: char) -> u8 {
let res = c as u32;
debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
debug_assert!(res <= u8::MAX as u32, "guaranteed because of Byte");
res as u8
}
Loading

0 comments on commit 8b2b3a7

Please sign in to comment.