From f2dee333120a2c39236e3d52ba9b0b1e49cbe345 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 24 Jan 2024 16:36:22 +1100 Subject: [PATCH] Implement RFC 3349, mixed utf8 literals. Specifically: - Allow unicode chars in b"" and br"" literals. This is done by changing `Mode::allow_unicode_chars` to succeed on `ByteStr` and `RawByteStr`. - Allow unicode escapes in b"" literals. This is done by changing `Mode::allow_unicode_escapes` to succeed on `ByteStr`. Byte string literals can already have high bytes (`\x80`..`\xff`). Because they now also support unicode chars, they can now be mixed utf8, so we use `unescape_mixed`/`cook_mixed` instead of `unescape_unicode`/`cook_unicode` to process them. A new type, `Rfc3349`, is used to implement the feature gating. Values of that type are threaded through the unescaping code to track whether rules from rfc3349 are required for unescaping to succeed. Test changes XXX: not sure about the latter three; could just move them into accepting tests - tests/ui/attributes/key-value-non-ascii.rs: changed from a byte string literal to a byte literal; we just need some kind of problem with a literal to preserve the test's intent. - tests/ui/parser/raw/raw-byte-string-literals.rs: removed the raw byte string literal with a non-ASCII char. The other lexing errors meant that the feature gate warning wasn't occurring anyway, because compilation was aborting too early. No great loss, because we'll test far more complex cases in `tests/ui/mixed-utf8-literals/`. - tests/ui/parser/byte-string-literals.rs: similar. - tests/ui/parser/issues/issue-23620-invalid-escapes.rs: left the test unchanged; two old `unicode escape in byte string` errors are now `mixed utf8 b"" and br"" literals are experimental` errors. - tests/ui/parser/unicode-control-codepoints.rs: similar. - tests/ui/suggestions/multibyte-escapes.rs: similar. 
XXX: not sure how to handle rust-analyzer, just allowed mixed utf8 literals everywhere without complaint --- compiler/rustc_ast/src/util/literal.rs | 16 ++- compiler/rustc_ast_passes/src/feature_gate.rs | 1 + compiler/rustc_feature/src/unstable.rs | 2 + compiler/rustc_lexer/src/unescape.rs | 93 +++++++++++----- compiler/rustc_lexer/src/unescape/tests.rs | 32 ++++-- compiler/rustc_parse/messages.ftl | 4 + compiler/rustc_parse/src/lexer/mod.rs | 14 ++- .../src/lexer/unescape_error_reporting.rs | 8 +- compiler/rustc_parse_format/src/lib.rs | 4 +- compiler/rustc_span/src/symbol.rs | 1 + .../language-features/mixed-utf8-literals.md | 9 ++ .../clippy/clippy_dev/src/update_lints.rs | 2 +- .../crates/parser/src/lexed_str.rs | 14 ++- .../crates/syntax/src/ast/token_ext.rs | 52 +++++++-- .../crates/syntax/src/validation.rs | 12 +- tests/ui/attributes/key-value-non-ascii.rs | 2 +- .../ui/attributes/key-value-non-ascii.stderr | 14 +-- .../feature-gate-mixed-utf8-literals.rs | 5 + .../feature-gate-mixed-utf8-literals.stderr | 33 ++++++ tests/ui/parser/byte-string-literals.rs | 2 - tests/ui/parser/byte-string-literals.stderr | 21 +--- .../issues/issue-23620-invalid-escapes.rs | 4 +- .../issues/issue-23620-invalid-escapes.stderr | 37 +++--- .../ui/parser/raw/raw-byte-string-literals.rs | 1 - .../raw/raw-byte-string-literals.stderr | 10 +- tests/ui/parser/unicode-control-codepoints.rs | 13 +-- .../parser/unicode-control-codepoints.stderr | 105 +++++------------- tests/ui/suggestions/multibyte-escapes.rs | 7 +- tests/ui/suggestions/multibyte-escapes.stderr | 14 +-- 29 files changed, 306 insertions(+), 226 deletions(-) create mode 100644 src/doc/unstable-book/src/language-features/mixed-utf8-literals.md create mode 100644 tests/ui/feature-gates/feature-gate-mixed-utf8-literals.rs create mode 100644 tests/ui/feature-gates/feature-gate-mixed-utf8-literals.stderr diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 
aaeb1bb9bff82..6ea42da45438c 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -3,7 +3,7 @@ use crate::ast::{self, LitKind, MetaItemLit, StrStyle}; use crate::token::{self, Token}; use rustc_lexer::unescape::{ - byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode, + unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode, }; use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::Span; @@ -49,7 +49,8 @@ impl LitKind { // For byte/char/string literals, chars and escapes have already been // checked in the lexer (in `cook_lexer_literal`). So we can assume all - // chars and escapes are valid here. + // chars and escapes are valid here, and ignore `Rfc3349` return + // values. Ok(match kind { token::Bool => { assert!(symbol.is_bool_lit()); @@ -84,7 +85,7 @@ impl LitKind { // Force-inlining here is aggressive but the closure is // called on every char in the string, so it can be hot in // programs with many long strings containing escapes. 
- unescape_unicode( + _ = unescape_unicode( s, Mode::Str, &mut #[inline(always)] @@ -108,8 +109,11 @@ impl LitKind { token::ByteStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c { - Ok(c) => buf.push(byte_from_char(c)), + _ = unescape_mixed(s, Mode::ByteStr, &mut |_, c| match c { + Ok(MixedUnit::Char(c)) => { + buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) + } + Ok(MixedUnit::HighByte(b)) => buf.push(b), Err(err) => { assert!(!err.is_fatal(), "failed to unescape string literal") } @@ -125,7 +129,7 @@ impl LitKind { token::CStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_mixed(s, Mode::CStr, &mut |_span, c| match c { + _ = unescape_mixed(s, Mode::CStr, &mut |_span, c| match c { Ok(MixedUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } diff --git a/compiler/rustc_ast_passes/src/feature_gate.rs b/compiler/rustc_ast_passes/src/feature_gate.rs index 82236d2e30678..5723b989d1b28 100644 --- a/compiler/rustc_ast_passes/src/feature_gate.rs +++ b/compiler/rustc_ast_passes/src/feature_gate.rs @@ -508,6 +508,7 @@ pub fn check_crate(krate: &ast::Crate, sess: &Session, features: &Features) { } }; } + gate_all!(mixed_utf8_literals, r#"mixed utf8 b"" and br"" literals are experimental"#); gate_all!( if_let_guard, "`if let` guards are experimental", diff --git a/compiler/rustc_feature/src/unstable.rs b/compiler/rustc_feature/src/unstable.rs index 6eed2178ead8d..55a8ad1e5aa3d 100644 --- a/compiler/rustc_feature/src/unstable.rs +++ b/compiler/rustc_feature/src/unstable.rs @@ -520,6 +520,8 @@ declare_features! ( /// standard library until the soundness issues with specialization /// are fixed. (unstable, min_specialization, "1.7.0", Some(31844)), + /// Allows mixed utf8 b"" and br"" literals. 
+ (unstable, mixed_utf8_literals, "CURRENT_RUSTC_VERSION", Some(116907)), /// Allows qualified paths in struct expressions, struct patterns and tuple struct patterns. (unstable, more_qualified_paths, "1.54.0", Some(86935)), /// Allows the `#[must_not_suspend]` attribute. diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 03d178eb266a4..ceb9e58713633 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -9,6 +9,9 @@ use Mode::*; #[cfg(test)] mod tests; +// njn: need to add tests in tests/ui/mixed-utf8-literals/; see +// tests/ui/try-block/ for an example to follow + /// Errors and warnings that can occur during string unescaping. They mostly /// relate to malformed escape sequences, but there are a few that are about /// other problems. @@ -85,7 +88,7 @@ impl EscapeError { /// /// Values are returned by invoking `callback`. For `Char` and `Byte` modes, /// the callback will be called exactly once. -pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) +pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) -> Rfc3349 where F: FnMut(Range, Result), { @@ -94,8 +97,9 @@ where let mut chars = src.chars(); let res = unescape_char_or_byte(&mut chars, mode); callback(0..(src.len() - chars.as_str().len()), res); + Rfc3349::Unused // rfc3349 is not relevant for char or byte literals } - Str | ByteStr => unescape_non_raw_common(src, mode, callback), + Str => unescape_non_raw_common(src, mode, callback), RawStr | RawByteStr => check_raw_common(src, mode, callback), RawCStr => check_raw_common(src, mode, &mut |r, mut result| { if let Ok('\0') = result { @@ -103,7 +107,7 @@ where } callback(r, result) }), - CStr => unreachable!(), + ByteStr | CStr => unreachable!(), } } @@ -142,18 +146,19 @@ impl From for MixedUnit { /// a sequence of escaped characters or errors. /// /// Values are returned by invoking `callback`. 
-pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) +pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) -> Rfc3349 where F: FnMut(Range, Result), { match mode { + ByteStr => unescape_non_raw_common(src, mode, callback), CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| { if let Ok(MixedUnit::Char('\0')) = result { result = Err(EscapeError::NulInCStr); } callback(r, result) }), - Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), + Char | Byte | Str | RawStr | RawByteStr | RawCStr => unreachable!(), } } @@ -169,6 +174,15 @@ pub fn unescape_byte(src: &str) -> Result { unescape_char_or_byte(&mut src.chars(), Byte).map(byte_from_char) } +/// Used to indicate if rfc3349 (mixed-utf8-literals) was required for the +/// literal to be valid. Once rfc3349 is stabilized this type can be removed. +#[derive(Debug, PartialEq)] +#[must_use] +pub enum Rfc3349 { + Used, + Unused, +} + /// What kind of literal do we parse. #[derive(Debug, Clone, Copy, PartialEq)] pub enum Mode { @@ -205,17 +219,25 @@ impl Mode { /// Are unicode (non-ASCII) chars allowed? #[inline] - fn allow_unicode_chars(self) -> bool { + fn allow_unicode_chars(self, rfc3349: &mut Rfc3349) -> bool { match self { - Byte | ByteStr | RawByteStr => false, + Byte => false, + ByteStr | RawByteStr => { + *rfc3349 = Rfc3349::Used; + true + } Char | Str | RawStr | CStr | RawCStr => true, } } /// Are unicode escapes (`\u`) allowed? - fn allow_unicode_escapes(self) -> bool { + fn allow_unicode_escapes(self, rfc3349: &mut Rfc3349) -> bool { match self { - Byte | ByteStr => false, + Byte => false, + ByteStr => { + *rfc3349 = Rfc3349::Used; + true + } Char | Str | CStr => true, RawByteStr | RawStr | RawCStr => unreachable!(), } @@ -233,6 +255,7 @@ impl Mode { fn scan_escape + From>( chars: &mut Chars<'_>, mode: Mode, + rfc3349: &mut Rfc3349, ) -> Result { // Previous character was '\\', unescape what follows. 
let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? { @@ -262,13 +285,17 @@ fn scan_escape + From>( Ok(T::from(value as u8)) }; } - 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), + 'u' => return scan_unicode(chars, mode, rfc3349).map(T::from), _ => return Err(EscapeError::InvalidEscape), }; Ok(T::from(res)) } -fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result { +fn scan_unicode( + chars: &mut Chars<'_>, + mode: Mode, + rfc3349: &mut Rfc3349, +) -> Result { // We've parsed '\u', now we have to parse '{..}'. if chars.next() != Some('{') { @@ -296,7 +323,7 @@ fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result, allow_unicode_escapes: bool) -> Result Result { - if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) } +fn ascii_check(c: char, mode: Mode, rfc3349: &mut Rfc3349) -> Result { + // We must check `is_ascii` first, to avoid setting `rfc3349` unnecessarily. + if c.is_ascii() || mode.allow_unicode_chars(rfc3349) { + Ok(c) + } else { + Err(EscapeError::NonAsciiCharInByte) + } } fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { let c = chars.next().ok_or(EscapeError::ZeroChars)?; + let mut rfc3349 = Rfc3349::Unused; let res = match c { - '\\' => scan_escape(chars, mode), + '\\' => scan_escape(chars, mode, &mut rfc3349), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode.allow_unicode_chars()), + _ => ascii_check(c, mode, &mut rfc3349), }?; + + // rfc3349 is not relevant for char or byte literals. 
+ assert_eq!(rfc3349, Rfc3349::Unused); + if chars.next().is_some() { return Err(EscapeError::MoreThanOneChar); } @@ -342,12 +379,16 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result + From>(src: &str, mode: Mode, callback: &mut F) +fn unescape_non_raw_common + From>( + src: &str, + mode: Mode, + callback: &mut F, +) -> Rfc3349 where F: FnMut(Range, Result), { let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop + let mut rfc3349 = Rfc3349::Unused; // The `start` and `end` computation here is complicated because // `skip_ascii_whitespace` makes us to skip over chars without counting @@ -367,16 +408,17 @@ where }); continue; } - _ => scan_escape::(&mut chars, mode), + _ => scan_escape::(&mut chars, mode, &mut rfc3349), } } '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, allow_unicode_chars).map(T::from), + _ => ascii_check(c, mode, &mut rfc3349).map(T::from), }; let end = src.len() - chars.as_str().len(); callback(start..end, res); } + rfc3349 } fn skip_ascii_whitespace(chars: &mut Chars<'_>, start: usize, callback: &mut F) @@ -409,12 +451,12 @@ where /// sequence of characters or errors. /// NOTE: Raw strings do not perform any explicit character escaping, here we /// only produce errors on bare CR. 
-fn check_raw_common(src: &str, mode: Mode, callback: &mut F) +fn check_raw_common(src: &str, mode: Mode, callback: &mut F) -> Rfc3349 where F: FnMut(Range, Result), { let mut chars = src.chars(); - let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop + let mut rfc3349 = Rfc3349::Unused; // The `start` and `end` computation here matches the one in // `unescape_non_raw_common` for consistency, even though this function @@ -423,16 +465,17 @@ where let start = src.len() - chars.as_str().len() - c.len_utf8(); let res = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), - _ => ascii_check(c, allow_unicode_chars), + _ => ascii_check(c, mode, &mut rfc3349), }; let end = src.len() - chars.as_str().len(); callback(start..end, res); } + rfc3349 } #[inline] -pub fn byte_from_char(c: char) -> u8 { +pub(crate) fn byte_from_char(c: char) -> u8 { let res = c as u32; - debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); + debug_assert!(res <= u8::MAX as u32, "guaranteed because of Byte"); res as u8 } diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index 5b99495f47581..18b18e6ad0682 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -100,7 +100,9 @@ fn test_unescape_char_good() { fn test_unescape_str_warn() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); + let rfc3349 = + unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); + assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str` assert_eq!(unescaped, expected); } @@ -124,7 +126,7 @@ fn test_unescape_str_warn() { fn test_unescape_str_good() { fn check(literal_text: &str, expected: &str) { let mut buf = Ok(String::with_capacity(literal_text.len())); - 
unescape_unicode(literal_text, Mode::Str, &mut |range, c| { + let rfc3349 = unescape_unicode(literal_text, Mode::Str, &mut |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(c), @@ -132,6 +134,7 @@ fn test_unescape_str_good() { } } }); + assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::Str` assert_eq!(buf.as_deref(), Ok(expected)) } @@ -240,16 +243,20 @@ fn test_unescape_byte_good() { #[test] fn test_unescape_byte_str_good() { fn check(literal_text: &str, expected: &[u8]) { - let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| { - if let Ok(b) = &mut buf { + let mut buf_res = Ok(Vec::with_capacity(literal_text.len())); + let rfc3349 = unescape_mixed(literal_text, Mode::ByteStr, &mut |range, c| { + if let Ok(buf) = &mut buf_res { match c { - Ok(c) => b.push(byte_from_char(c)), - Err(e) => buf = Err((range, e)), + Ok(MixedUnit::Char(c)) => { + buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) + } + Ok(MixedUnit::HighByte(b)) => buf.push(b), + Err(e) => buf_res = Err((range, e)), } } }); - assert_eq!(buf.as_deref(), Ok(expected)) + assert_eq!(rfc3349, Rfc3349::Unused); // njn: should have examples where this isn't true + assert_eq!(buf_res.as_deref(), Ok(expected)) } check("foo", b"foo"); @@ -264,7 +271,9 @@ fn test_unescape_byte_str_good() { fn test_unescape_raw_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); + let rfc3349 = + unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); + assert_eq!(rfc3349, Rfc3349::Unused); // rfc3349 not relevant for `Mode::RawStr` assert_eq!(unescaped, expected); } @@ -276,7 +285,10 @@ fn test_unescape_raw_str() { fn test_unescape_raw_byte_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = 
Vec::with_capacity(literal.len()); - unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); + let rfc3349 = unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| { + unescaped.push((range, res)) + }); + assert_eq!(rfc3349, Rfc3349::Unused); // njn: todo assert_eq!(unescaped, expected); } diff --git a/compiler/rustc_parse/messages.ftl b/compiler/rustc_parse/messages.ftl index f904e0c44ea92..4c26a81db046a 100644 --- a/compiler/rustc_parse/messages.ftl +++ b/compiler/rustc_parse/messages.ftl @@ -814,6 +814,10 @@ parse_unexpected_vert_vert_before_function_parameter = unexpected `||` before fu parse_unexpected_vert_vert_in_pattern = unexpected token `||` in pattern .suggestion = use a single `|` to separate multiple alternative patterns +# njn: +# - b'\u{1234}' error says "unicode escape in byte string", should be "byte literal" +# - after rfc3349 stabilizes, byte literal will be the only error case here +# - could add a `.desc` field in a precursor parse_unicode_escape_in_byte = unicode escape in byte string .label = {parse_unicode_escape_in_byte} .help = unicode escape sequences cannot be used as a byte or in a byte string diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index 20ec4a300c1f8..c68589ed5a51b 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -8,9 +8,8 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind}; use rustc_ast::tokenstream::TokenStream; use rustc_ast::util::unicode::contains_text_flow_control_chars; use rustc_errors::{error_code, Applicability, DiagCtxt, DiagnosticBuilder, StashKey}; -use rustc_lexer::unescape::{self, EscapeError, Mode}; -use rustc_lexer::{Base, DocStyle, RawStrError}; -use rustc_lexer::{Cursor, LiteralKind}; +use rustc_lexer::unescape::{self, EscapeError, Mode, Rfc3349}; +use rustc_lexer::{Base, Cursor, DocStyle, LiteralKind, RawStrError}; use rustc_session::lint::builtin::{ 
RUST_2021_PREFIXES_INCOMPATIBLE_SYNTAX, TEXT_DIRECTION_CODEPOINT_IN_COMMENT, }; @@ -436,7 +435,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { .with_code(error_code!(E0766)) .emit() } - self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " + self.cook_mixed(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " } rustc_lexer::LiteralKind::CStr { terminated } => { if !terminated { @@ -697,13 +696,13 @@ impl<'sess, 'src> StringReader<'sess, 'src> { end: BytePos, prefix_len: u32, postfix_len: u32, - unescape: fn(&str, Mode, &mut dyn FnMut(Range, Result<(), EscapeError>)), + unescape: fn(&str, Mode, &mut dyn FnMut(Range, Result<(), EscapeError>)) -> Rfc3349, ) -> (token::LitKind, Symbol) { let mut has_fatal_err = false; let content_start = start + BytePos(prefix_len); let content_end = end - BytePos(postfix_len); let lit_content = self.str_from_to(content_start, content_end); - unescape(lit_content, mode, &mut |range, result| { + let rfc3349 = unescape(lit_content, mode, &mut |range, result| { // Here we only check for errors. The actual unescaping is done later. if let Err(err) = result { let span_with_quotes = self.mk_sp(start, end); @@ -725,6 +724,9 @@ impl<'sess, 'src> StringReader<'sess, 'src> { ); } }); + if rfc3349 == Rfc3349::Used { + self.sess.gated_spans.gate(sym::mixed_utf8_literals, self.mk_sp(start, end)); + } // We normally exclude the quotes for the symbol, but for errors we // include it because it results in clearer error messages. 
diff --git a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs index 3238f8e23bb0a..4b4d96280375a 100644 --- a/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs +++ b/compiler/rustc_parse/src/lexer/unescape_error_reporting.rs @@ -129,7 +129,10 @@ pub(crate) fn emit_unescape_error( EscapeError::InvalidEscape => { let (c, span) = last_char(); - let label = if mode == Mode::Byte || mode == Mode::ByteStr { + // njn: missing CByteStr, fix in a precursor + // njn: actually, use predicates much more in this function rather + // than ad hoc comparisons like this + let label = if matches!(mode, Mode::Byte | Mode::ByteStr) { "unknown byte escape" } else { "unknown character escape" @@ -175,6 +178,7 @@ pub(crate) fn emit_unescape_error( EscapeError::NonAsciiCharInByte => { let (c, span) = last_char(); let desc = match mode { + // Note: once rfc3349 stabilizes, only `Mode::Byte` will be reachable here. Mode::Byte => "byte literal", Mode::ByteStr => "byte string literal", Mode::RawByteStr => "raw byte string literal", @@ -188,7 +192,7 @@ pub(crate) fn emit_unescape_error( }; err.span_label(span, format!("must be ASCII{postfix}")); // Note: the \\xHH suggestions are not given for raw byte string - // literals, because they are araw and so cannot use any escapes. + // literals, because they cannot use escapes. 
if (c as u32) <= 0xFF && mode != Mode::RawByteStr { err.span_suggestion( span, diff --git a/compiler/rustc_parse_format/src/lib.rs b/compiler/rustc_parse_format/src/lib.rs index d76ee161da6fd..fea262aa729bd 100644 --- a/compiler/rustc_parse_format/src/lib.rs +++ b/compiler/rustc_parse_format/src/lib.rs @@ -1056,7 +1056,9 @@ fn find_width_map_from_snippet( fn unescape_string(string: &str) -> Option { let mut buf = string::String::new(); let mut ok = true; - unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| { + // njn: argh, need to use the Rfc3349 return value + // - oh, can I just ignore, because already checked in lexer? test that + _ = unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| { match unescaped_char { Ok(c) => buf.push(c), Err(_) => ok = false, diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 6c39a38750ec9..42de7ddd1b2c6 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -1060,6 +1060,7 @@ symbols! 
{ mir_unwind_unreachable, mir_variant, miri, + mixed_utf8_literals, mmx_reg, modifiers, module, diff --git a/src/doc/unstable-book/src/language-features/mixed-utf8-literals.md b/src/doc/unstable-book/src/language-features/mixed-utf8-literals.md new file mode 100644 index 0000000000000..f54d8fe98b381 --- /dev/null +++ b/src/doc/unstable-book/src/language-features/mixed-utf8-literals.md @@ -0,0 +1,9 @@ +# `mixed_utf8_literals` + +The tracking issue for this feature is: [#116907] + +[#116907]: https://github.com/rust-lang/rust/issues/116907 + +------------------------ + +njn: short description diff --git a/src/tools/clippy/clippy_dev/src/update_lints.rs b/src/tools/clippy/clippy_dev/src/update_lints.rs index f598f5d3d50f8..6b76a44debff7 100644 --- a/src/tools/clippy/clippy_dev/src/update_lints.rs +++ b/src/tools/clippy/clippy_dev/src/update_lints.rs @@ -928,7 +928,7 @@ fn remove_line_splices(s: &str) -> String { .and_then(|s| s.strip_suffix('"')) .unwrap_or_else(|| panic!("expected quoted string, found `{s}`")); let mut res = String::with_capacity(s.len()); - unescape::unescape_unicode(s, unescape::Mode::Str, &mut |range, ch| { + unescape::unescape_literal(s, unescape::Mode::Str, &mut |range, ch| { if ch.is_ok() { res.push_str(&s[range]); } diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index bf1feb9a7eb07..f63a499a1be98 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -365,9 +365,11 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { EscapeError::NonAsciiCharInByte if mode == Mode::Byte => { "non-ASCII character in byte literal" } - EscapeError::NonAsciiCharInByte if mode == Mode::ByteStr => { + // Note: once rfc3349 stabilizes, this arm will be unreachable. 
+ EscapeError::NonAsciiCharInByte if matches!(mode, Mode::ByteStr) => { "non-ASCII character in byte string literal" } + // Note: once rfc3349 stabilizes, this arm will be unreachable. EscapeError::NonAsciiCharInByte => "non-ASCII character in raw byte string literal", EscapeError::NulInCStr => "null character in C string literal", EscapeError::UnskippedWhitespaceWarning => "", @@ -378,15 +380,17 @@ fn error_to_diagnostic_message(error: EscapeError, mode: Mode) -> &'static str { fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str { let mut error_message = ""; match mode { - Mode::CStr => { - rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| { + Mode::ByteStr | Mode::CStr => { + // Can ignore the `Rfc3349` return value. + _ = rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| { if let Err(e) = res { error_message = error_to_diagnostic_message(e, mode); } }); } - Mode::ByteStr | Mode::Str => { - rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| { + Mode::Str => { + // Can ignore the `Rfc3349` return value. + _ = rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| { if let Err(e) = res { error_message = error_to_diagnostic_message(e, mode); } diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs index 7cd1f1550b988..da89fefe7914c 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs @@ -193,7 +193,8 @@ pub trait IsString: AstToken { let text = &self.text()[text_range_no_quotes - start]; let offset = text_range_no_quotes.start() - start; - unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| { + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. 
+ _ = unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| { let text_range = TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); cb(text_range + offset, unescaped_char); @@ -226,7 +227,8 @@ impl ast::String { let mut buf = String::new(); let mut prev_end = 0; let mut has_error = false; - unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. + _ = unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( unescaped_char, buf.capacity() == 0, ) { @@ -253,9 +255,34 @@ impl ast::String { impl IsString for ast::ByteString { const RAW_PREFIX: &'static str = "br"; const MODE: Mode = Mode::ByteStr; + + // njn: duplicates CString::escaped_char_ranges + fn escaped_char_ranges( + &self, + cb: &mut dyn FnMut(TextRange, Result), + ) { + let text_range_no_quotes = match self.text_range_between_quotes() { + Some(it) => it, + None => return, + }; + + let start = self.syntax().text_range().start(); + let text = &self.text()[text_range_no_quotes - start]; + let offset = text_range_no_quotes.start() - start; + + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. + _ = unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| { + let text_range = + TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); + // XXX: This method should only be used for highlighting ranges. The unescaped + // char/byte is not used. For simplicity, we return an arbitrary placeholder char. 
+ cb(text_range + offset, unescaped_char.map(|_| ' ')); + }); + } } impl ast::ByteString { + // njn: duplicates CString::value pub fn value(&self) -> Option> { if self.is_raw() { let text = self.text(); @@ -270,18 +297,23 @@ impl ast::ByteString { let mut buf: Vec = Vec::new(); let mut prev_end = 0; let mut has_error = false; - unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( + let extend_unit = |buf: &mut Vec, unit: MixedUnit| match unit { + MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()), + MixedUnit::HighByte(b) => buf.push(b), + }; + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. + _ = unescape_mixed(text, Self::MODE, &mut |char_range, unescaped_char| match ( unescaped_char, buf.capacity() == 0, ) { - (Ok(c), false) => buf.push(c as u8), + (Ok(u), false) => extend_unit(&mut buf, u), (Ok(_), true) if char_range.len() == 1 && char_range.start == prev_end => { prev_end = char_range.end } - (Ok(c), true) => { + (Ok(u), true) => { buf.reserve_exact(text.len()); - buf.extend_from_slice(text[..prev_end].as_bytes()); - buf.push(c as u8); + buf.extend(text[..prev_end].as_bytes()); + extend_unit(&mut buf, u); } (Err(_), _) => has_error = true, }); @@ -311,7 +343,8 @@ impl IsString for ast::CString { let text = &self.text()[text_range_no_quotes - start]; let offset = text_range_no_quotes.start() - start; - unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| { + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. + _ = unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| { let text_range = TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); // XXX: This method should only be used for highlighting ranges. 
The unescaped @@ -340,7 +373,8 @@ impl ast::CString { MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()), MixedUnit::HighByte(b) => buf.push(b), }; - unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match ( + // Ignores the `Rfc3349` return value, thus permitting mixed utf8 literals. + _ = unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match ( unescaped, buf.capacity() == 0, ) { diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs index 5c5b26f525f66..fae7b37e9b2c4 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs @@ -136,11 +136,13 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } }; + // Ignores the `Rfc3349` return value from the `unescape_*` functions, thus + // permitting mixed utf8 literals. match literal.kind() { ast::LiteralKind::String(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 1, '"') { - unescape_unicode(without_quotes, Mode::Str, &mut |range, char| { + _ = unescape_unicode(without_quotes, Mode::Str, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -151,7 +153,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::ByteString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_unicode(without_quotes, Mode::ByteStr, &mut |range, char| { + _ = unescape_mixed(without_quotes, Mode::ByteStr, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -162,7 +164,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::CString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| { + _ = unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| { if let Err(err) = char { push_err(1, 
range.start, err); } @@ -172,7 +174,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } ast::LiteralKind::Char(_) => { if let Some(without_quotes) = unquote(text, 1, '\'') { - unescape_unicode(without_quotes, Mode::Char, &mut |range, char| { + _ = unescape_unicode(without_quotes, Mode::Char, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -181,7 +183,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } ast::LiteralKind::Byte(_) => { if let Some(without_quotes) = unquote(text, 2, '\'') { - unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| { + _ = unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| { if let Err(err) = char { push_err(2, range.start, err); } diff --git a/tests/ui/attributes/key-value-non-ascii.rs b/tests/ui/attributes/key-value-non-ascii.rs index e14e2fc05ad39..8e0bb6bc50ca6 100644 --- a/tests/ui/attributes/key-value-non-ascii.rs +++ b/tests/ui/attributes/key-value-non-ascii.rs @@ -1,4 +1,4 @@ #![feature(rustc_attrs)] -#[rustc_dummy = b"ffi.rs"] //~ ERROR non-ASCII character in byte string literal +#[rustc_dummy = b'ffi'] //~ ERROR non-ASCII character in byte literal fn main() {} diff --git a/tests/ui/attributes/key-value-non-ascii.stderr b/tests/ui/attributes/key-value-non-ascii.stderr index cc01bc46ebd29..e9b6947bf4cba 100644 --- a/tests/ui/attributes/key-value-non-ascii.stderr +++ b/tests/ui/attributes/key-value-non-ascii.stderr @@ -1,13 +1,11 @@ -error: non-ASCII character in byte string literal +error: non-ASCII character in byte literal --> $DIR/key-value-non-ascii.rs:3:19 | -LL | #[rustc_dummy = b"ffi.rs"] - | ^ must be ASCII - | -help: if you meant to use the UTF-8 encoding of 'ffi', use \xHH escapes - | -LL | #[rustc_dummy = b"/xEF/xAC/x83.rs"] - | ~~~~~~~~~~~~ +LL | #[rustc_dummy = b'ffi'] + | ^ + | | + | must be ASCII + | this multibyte character does not fit into a single byte error: aborting due to 1 previous error diff --git 
a/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.rs b/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.rs new file mode 100644 index 0000000000000..d037ea1a51876 --- /dev/null +++ b/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.rs @@ -0,0 +1,5 @@ +fn main() { + _ = b"a¥🦀"; //~ ERROR mixed utf8 + _ = br"a¥🦀"; //~ ERROR mixed utf8 + _ = b"a\u{a5}\u{1f980}"; //~ ERROR mixed utf8 +} diff --git a/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.stderr b/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.stderr new file mode 100644 index 0000000000000..bdff26269b720 --- /dev/null +++ b/tests/ui/feature-gates/feature-gate-mixed-utf8-literals.stderr @@ -0,0 +1,33 @@ +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/feature-gate-mixed-utf8-literals.rs:2:9 + | +LL | _ = b"a¥🦀"; + | ^^^^^^^ + | + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date + +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/feature-gate-mixed-utf8-literals.rs:3:9 + | +LL | _ = br"a¥🦀"; + | ^^^^^^^^ + | + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date + +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/feature-gate-mixed-utf8-literals.rs:4:9 + | +LL | _ = b"a\u{a5}\u{1f980}"; + | ^^^^^^^^^^^^^^^^^^^ + | + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date + +error: aborting due to 3 previous errors + +For more information about this error, try `rustc --explain E0658`. 
diff --git a/tests/ui/parser/byte-string-literals.rs b/tests/ui/parser/byte-string-literals.rs index 30a4f50c4e40b..dae941d342f70 100644 --- a/tests/ui/parser/byte-string-literals.rs +++ b/tests/ui/parser/byte-string-literals.rs @@ -3,7 +3,5 @@ static FOO: &'static [u8] = b"\f"; //~ ERROR unknown byte escape pub fn main() { b"\f"; //~ ERROR unknown byte escape b"\x0Z"; //~ ERROR invalid character in numeric character escape: `Z` - b"é"; //~ ERROR non-ASCII character in byte string literal - br##"é"##; //~ ERROR non-ASCII character in raw byte string literal b"a //~ ERROR unterminated double quote byte string } diff --git a/tests/ui/parser/byte-string-literals.stderr b/tests/ui/parser/byte-string-literals.stderr index 655b6998e85ff..2186b4c2e494c 100644 --- a/tests/ui/parser/byte-string-literals.stderr +++ b/tests/ui/parser/byte-string-literals.stderr @@ -20,31 +20,14 @@ error: invalid character in numeric character escape: `Z` LL | b"\x0Z"; | ^ invalid character in numeric character escape -error: non-ASCII character in byte string literal - --> $DIR/byte-string-literals.rs:6:7 - | -LL | b"é"; - | ^ must be ASCII - | -help: if you meant to use the unicode code point for 'é', use a \xHH escape - | -LL | b"\xE9"; - | ~~~~ - -error: non-ASCII character in raw byte string literal - --> $DIR/byte-string-literals.rs:7:10 - | -LL | br##"é"##; - | ^ must be ASCII - error[E0766]: unterminated double quote byte string - --> $DIR/byte-string-literals.rs:8:6 + --> $DIR/byte-string-literals.rs:6:6 | LL | b"a | ______^ LL | | } | |__^ -error: aborting due to 6 previous errors +error: aborting due to 4 previous errors For more information about this error, try `rustc --explain E0766`. 
diff --git a/tests/ui/parser/issues/issue-23620-invalid-escapes.rs b/tests/ui/parser/issues/issue-23620-invalid-escapes.rs index c1355f0d6fe0c..6652a64700745 100644 --- a/tests/ui/parser/issues/issue-23620-invalid-escapes.rs +++ b/tests/ui/parser/issues/issue-23620-invalid-escapes.rs @@ -1,6 +1,6 @@ fn main() { let _ = b"\u{a66e}"; - //~^ ERROR unicode escape in byte string + //~^ ERROR mixed utf8 b"" and br"" literals are experimental let _ = b'\u{a66e}'; //~^ ERROR unicode escape in byte string @@ -21,7 +21,7 @@ fn main() { //~^ ERROR invalid character in numeric character escape: `x` let _ = b"\u{a4a4} \xf \u"; - //~^ ERROR unicode escape in byte string + //~^ ERROR mixed utf8 b"" and br"" literals are experimental //~^^ ERROR invalid character in numeric character escape: ` ` //~^^^ ERROR incorrect unicode escape sequence diff --git a/tests/ui/parser/issues/issue-23620-invalid-escapes.stderr b/tests/ui/parser/issues/issue-23620-invalid-escapes.stderr index 88d97c795fc2a..4b3afdb5523c9 100644 --- a/tests/ui/parser/issues/issue-23620-invalid-escapes.stderr +++ b/tests/ui/parser/issues/issue-23620-invalid-escapes.stderr @@ -1,11 +1,3 @@ -error: unicode escape in byte string - --> $DIR/issue-23620-invalid-escapes.rs:2:15 - | -LL | let _ = b"\u{a66e}"; - | ^^^^^^^^ unicode escape in byte string - | - = help: unicode escape sequences cannot be used as a byte or in a byte string - error: unicode escape in byte string --> $DIR/issue-23620-invalid-escapes.rs:5:15 | @@ -46,14 +38,6 @@ error: invalid character in numeric character escape: `x` LL | let _ = '\xxy'; | ^ invalid character in numeric character escape -error: unicode escape in byte string - --> $DIR/issue-23620-invalid-escapes.rs:23:15 - | -LL | let _ = b"\u{a4a4} \xf \u"; - | ^^^^^^^^ unicode escape in byte string - | - = help: unicode escape sequences cannot be used as a byte or in a byte string - error: invalid character in numeric character escape: ` ` --> $DIR/issue-23620-invalid-escapes.rs:23:27 | @@ 
-90,5 +74,26 @@ LL | let _ = "\u8f"; | | | help: format of unicode escape sequences uses braces: `\u{8f}` +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/issue-23620-invalid-escapes.rs:2:13 + | +LL | let _ = b"\u{a66e}"; + | ^^^^^^^^^^^ + | + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date + +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/issue-23620-invalid-escapes.rs:23:13 + | +LL | let _ = b"\u{a4a4} \xf \u"; + | ^^^^^^^^^^^^^^^^^^ + | + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date + error: aborting due to 13 previous errors +For more information about this error, try `rustc --explain E0658`. 
diff --git a/tests/ui/parser/raw/raw-byte-string-literals.rs b/tests/ui/parser/raw/raw-byte-string-literals.rs index 1b859fee596ad..c485fca5523d6 100644 --- a/tests/ui/parser/raw/raw-byte-string-literals.rs +++ b/tests/ui/parser/raw/raw-byte-string-literals.rs @@ -2,6 +2,5 @@ pub fn main() { br"a "; //~ ERROR bare CR not allowed in raw string - br"é"; //~ ERROR non-ASCII character in raw byte string literal br##~"a"~##; //~ ERROR only `#` is allowed in raw string delimitation } diff --git a/tests/ui/parser/raw/raw-byte-string-literals.stderr b/tests/ui/parser/raw/raw-byte-string-literals.stderr index a2f27d1ed70ae..134067b25e93d 100644 --- a/tests/ui/parser/raw/raw-byte-string-literals.stderr +++ b/tests/ui/parser/raw/raw-byte-string-literals.stderr @@ -4,17 +4,11 @@ error: bare CR not allowed in raw string LL | br"a "; | ^ -error: non-ASCII character in raw byte string literal - --> $DIR/raw-byte-string-literals.rs:5:8 - | -LL | br"é"; - | ^ must be ASCII - error: found invalid character; only `#` is allowed in raw string delimitation: ~ - --> $DIR/raw-byte-string-literals.rs:6:5 + --> $DIR/raw-byte-string-literals.rs:5:5 | LL | br##~"a"~##; | ^^^^^ -error: aborting due to 3 previous errors +error: aborting due to 2 previous errors diff --git a/tests/ui/parser/unicode-control-codepoints.rs b/tests/ui/parser/unicode-control-codepoints.rs index df099bb62ad1e..4a7e3aab08a7a 100644 --- a/tests/ui/parser/unicode-control-codepoints.rs +++ b/tests/ui/parser/unicode-control-codepoints.rs @@ -4,8 +4,7 @@ fn main() { println!("us\u{202B}e\u{202A}r"); println!("{:?}", r#"us\u{202B}e\u{202A}r"#); println!("{:?}", b"us\u{202B}e\u{202A}r"); - //~^ ERROR unicode escape in byte string - //~| ERROR unicode escape in byte string + //~^ ERROR mixed utf8 b"" and br"" literals are experimental println!("{:?}", br##"us\u{202B}e\u{202A}r"##); println!("{:?}", "/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "); @@ -14,15 +13,9 @@ fn main() { println!("{:?}", r##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins 
only "##); //~^ ERROR unicode codepoint changing visible direction of text present in literal println!("{:?}", b"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "); - //~^ ERROR non-ASCII character in byte string literal - //~| ERROR non-ASCII character in byte string literal - //~| ERROR non-ASCII character in byte string literal - //~| ERROR non-ASCII character in byte string literal + //~^ ERROR mixed utf8 b"" and br"" literals are experimental println!("{:?}", br##"/*‮ } ⁦if isAdmin⁩ ⁦ begin admins only "##); - //~^ ERROR non-ASCII character in raw byte string literal - //~| ERROR non-ASCII character in raw byte string literal - //~| ERROR non-ASCII character in raw byte string literal - //~| ERROR non-ASCII character in raw byte string literal + //~^ ERROR mixed utf8 b"" and br"" literals are experimental println!("{:?}", '‮'); //~^ ERROR unicode codepoint changing visible direction of text present in literal } diff --git a/tests/ui/parser/unicode-control-codepoints.stderr b/tests/ui/parser/unicode-control-codepoints.stderr index fc071a9419142..ff3668829e37a 100644 --- a/tests/ui/parser/unicode-control-codepoints.stderr +++ b/tests/ui/parser/unicode-control-codepoints.stderr @@ -1,86 +1,32 @@ -error: unicode escape in byte string - --> $DIR/unicode-control-codepoints.rs:6:26 +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/unicode-control-codepoints.rs:6:22 | LL | println!("{:?}", b"us\u{202B}e\u{202A}r"); - | ^^^^^^^^ unicode escape in byte string + | ^^^^^^^^^^^^^^^^^^^^^^^ | - = help: unicode escape sequences cannot be used as a byte or in a byte string + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date -error: unicode escape in byte string - --> $DIR/unicode-control-codepoints.rs:6:35 - | -LL | println!("{:?}", b"us\u{202B}e\u{202A}r"); - | ^^^^^^^^ unicode escape in 
byte string - | - = help: unicode escape sequences cannot be used as a byte or in a byte string - -error: non-ASCII character in byte string literal - --> $DIR/unicode-control-codepoints.rs:16:26 - | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{202e}' - | -help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes - | -LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only "); - | ~~~~~~~~~~~~ - -error: non-ASCII character in byte string literal - --> $DIR/unicode-control-codepoints.rs:16:30 - | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{2066}' - | -help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes - | -LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only "); - | ~~~~~~~~~~~~ - -error: non-ASCII character in byte string literal - --> $DIR/unicode-control-codepoints.rs:16:41 +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/unicode-control-codepoints.rs:15:22 | LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{2069}' + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | -help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes - | -LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only "); - | ~~~~~~~~~~~~ - -error: non-ASCII character in byte string literal - --> $DIR/unicode-control-codepoints.rs:16:43 - | -LL | println!("{:?}", b"/* } if isAdmin begin admins only "); - | ^ must be ASCII but is '\u{2066}' - | -help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes - | -LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only "); - | ~~~~~~~~~~~~ - -error: non-ASCII character in raw byte string literal - --> $DIR/unicode-control-codepoints.rs:21:29 - | -LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{202e}' + = 
note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date -error: non-ASCII character in raw byte string literal - --> $DIR/unicode-control-codepoints.rs:21:33 +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/unicode-control-codepoints.rs:17:22 | LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{2066}' - -error: non-ASCII character in raw byte string literal - --> $DIR/unicode-control-codepoints.rs:21:44 + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | -LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{2069}' - -error: non-ASCII character in raw byte string literal - --> $DIR/unicode-control-codepoints.rs:21:46 - | -LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##); - | ^ must be ASCII but is '\u{2066}' + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date error: unicode codepoint changing visible direction of text present in comment --> $DIR/unicode-control-codepoints.rs:2:5 @@ -97,7 +43,7 @@ LL | // if access_level != "user" { // Check if admin = help: if their presence wasn't intentional, you can remove them error: unicode codepoint changing visible direction of text present in comment - --> $DIR/unicode-control-codepoints.rs:30:1 + --> $DIR/unicode-control-codepoints.rs:23:1 | LL | //"/* } if isAdmin begin admins only */" | ^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^ @@ -112,7 +58,7 @@ LL | //"/* } if isAdmin begin admins only */" = help: if their presence wasn't intentional, you can remove them error: unicode codepoint changing visible direction of text present in literal - --> 
$DIR/unicode-control-codepoints.rs:11:22 + --> $DIR/unicode-control-codepoints.rs:10:22 | LL | println!("{:?}", "/* } if isAdmin begin admins only "); | ^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^ @@ -132,7 +78,7 @@ LL | println!("{:?}", "/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} begi | ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ error: unicode codepoint changing visible direction of text present in literal - --> $DIR/unicode-control-codepoints.rs:14:22 + --> $DIR/unicode-control-codepoints.rs:13:22 | LL | println!("{:?}", r##"/* } if isAdmin begin admins only "##); | ^^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^ @@ -151,7 +97,7 @@ LL | println!("{:?}", r##"/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} b | ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ ~~~~~~~~ error: unicode codepoint changing visible direction of text present in literal - --> $DIR/unicode-control-codepoints.rs:26:22 + --> $DIR/unicode-control-codepoints.rs:19:22 | LL | println!("{:?}", ''); | ^- @@ -167,7 +113,7 @@ LL | println!("{:?}", '\u{202e}'); | ~~~~~~~~ error: unicode codepoint changing visible direction of text present in doc comment - --> $DIR/unicode-control-codepoints.rs:33:1 + --> $DIR/unicode-control-codepoints.rs:26:1 | LL | /** ''); */fn foo() {} | ^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint @@ -177,7 +123,7 @@ LL | /** ''); */fn foo() {} = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}' error: unicode codepoint changing visible direction of text present in doc comment - --> $DIR/unicode-control-codepoints.rs:36:1 + --> $DIR/unicode-control-codepoints.rs:29:1 | LL | / /** LL | | * @@ -188,5 +134,6 @@ LL | | * ''); */fn bar() {} = note: if their presence wasn't intentional, you can remove them = note: if you want to keep them but make them visible in your source code, you can escape them: '\u{202e}' -error: aborting due to 17 previous errors +error: aborting due to 10 previous errors +For more information 
about this error, try `rustc --explain E0658`. diff --git a/tests/ui/suggestions/multibyte-escapes.rs b/tests/ui/suggestions/multibyte-escapes.rs index c4105186244db..7f5e32eef55d5 100644 --- a/tests/ui/suggestions/multibyte-escapes.rs +++ b/tests/ui/suggestions/multibyte-escapes.rs @@ -12,7 +12,8 @@ fn main() { //~| NOTE: must be ASCII b"字"; - //~^ ERROR: non-ASCII character in byte string literal - //~| HELP: if you meant to use the UTF-8 encoding of '字', use \xHH escapes - //~| NOTE: must be ASCII + //~^ ERROR: mixed utf8 b"" and br"" literals are experimental + //~| NOTE: see issue #116907 + //~| HELP: add `#![feature(mixed_utf8_literals)]` + //~| NOTE: this compiler was built on YYYY-MM-DD } diff --git a/tests/ui/suggestions/multibyte-escapes.stderr b/tests/ui/suggestions/multibyte-escapes.stderr index 1e7c43e6538f6..4baf930eb3443 100644 --- a/tests/ui/suggestions/multibyte-escapes.stderr +++ b/tests/ui/suggestions/multibyte-escapes.stderr @@ -18,16 +18,16 @@ LL | b'字'; | must be ASCII | this multibyte character does not fit into a single byte -error: non-ASCII character in byte string literal - --> $DIR/multibyte-escapes.rs:14:7 +error[E0658]: mixed utf8 b"" and br"" literals are experimental + --> $DIR/multibyte-escapes.rs:14:5 | LL | b"字"; - | ^^ must be ASCII + | ^^^^^ | -help: if you meant to use the UTF-8 encoding of '字', use \xHH escapes - | -LL | b"\xE5\xAD\x97"; - | ~~~~~~~~~~~~ + = note: see issue #116907 for more information + = help: add `#![feature(mixed_utf8_literals)]` to the crate attributes to enable + = note: this compiler was built on YYYY-MM-DD; consider upgrading it if it is out of date error: aborting due to 3 previous errors +For more information about this error, try `rustc --explain E0658`.