diff --git a/src/librustc_ast/util/literal.rs b/src/librustc_ast/util/literal.rs index 1b17f343a6d67..4428d09902b92 100644 --- a/src/librustc_ast/util/literal.rs +++ b/src/librustc_ast/util/literal.rs @@ -6,8 +6,7 @@ use crate::tokenstream::TokenTree; use rustc_data_structures::sync::Lrc; use rustc_lexer::unescape::{unescape_byte, unescape_char}; -use rustc_lexer::unescape::{unescape_byte_str, unescape_str}; -use rustc_lexer::unescape::{unescape_raw_byte_str, unescape_raw_str}; +use rustc_lexer::unescape::{unescape_byte_literal, unescape_literal, Mode}; use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::Span; @@ -59,45 +58,53 @@ impl LitKind { // new symbol because the string in the LitKind is different to the // string in the token. let s = symbol.as_str(); - let symbol = if s.contains(&['\\', '\r'][..]) { - let mut buf = String::with_capacity(s.len()); - let mut error = Ok(()); - unescape_str(&s, &mut |_, unescaped_char| match unescaped_char { - Ok(c) => buf.push(c), - Err(_) => error = Err(LitError::LexerError), - }); - error?; - Symbol::intern(&buf) - } else { - symbol - }; + let symbol = + if s.contains(&['\\', '\r'][..]) { + let mut buf = String::with_capacity(s.len()); + let mut error = Ok(()); + unescape_literal(&s, Mode::Str, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } + }); + error?; + Symbol::intern(&buf) + } else { + symbol + }; LitKind::Str(symbol, ast::StrStyle::Cooked) } token::StrRaw(n) => { // Ditto. let s = symbol.as_str(); - let symbol = if s.contains('\r') { - let mut buf = String::with_capacity(s.len()); - let mut error = Ok(()); - unescape_raw_str(&s, &mut |_, unescaped_char| match unescaped_char { - Ok(c) => buf.push(c), - Err(_) => error = Err(LitError::LexerError), - }); - error?; - buf.shrink_to_fit(); - Symbol::intern(&buf) - } else { - symbol - }; + let symbol = + if s.contains('\r') { + let mut buf = String::with_capacity(s.len()); + let mut error = Ok(()); + unescape_literal(&s, Mode::RawStr, &mut |_, unescaped_char| { + match unescaped_char { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } + }); + error?; + buf.shrink_to_fit(); + Symbol::intern(&buf) + } else { + symbol + }; LitKind::Str(symbol, ast::StrStyle::Raw(n)) } token::ByteStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); let mut error = Ok(()); - unescape_byte_str(&s, &mut |_, unescaped_byte| match unescaped_byte { - Ok(c) => buf.push(c), - Err(_) => error = Err(LitError::LexerError), + unescape_byte_literal(&s, Mode::ByteStr, &mut |_, unescaped_byte| { + match unescaped_byte { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } }); error?; buf.shrink_to_fit(); @@ -108,9 +115,11 @@ impl LitKind { let bytes = if s.contains('\r') { let mut buf = Vec::with_capacity(s.len()); let mut error = Ok(()); - unescape_raw_byte_str(&s, &mut |_, unescaped_byte| match unescaped_byte { - Ok(c) => buf.push(c), - Err(_) => error = Err(LitError::LexerError), + unescape_byte_literal(&s, Mode::RawByteStr, &mut |_, unescaped_byte| { + match unescaped_byte { + Ok(c) => buf.push(c), + Err(_) => error = Err(LitError::LexerError), + } }); error?; buf.shrink_to_fit(); diff --git a/src/librustc_lexer/src/lib.rs b/src/librustc_lexer/src/lib.rs index 5ccfc1b276bfa..e44feee96607a 100644 --- a/src/librustc_lexer/src/lib.rs +++ b/src/librustc_lexer/src/lib.rs @@ -1,5 +1,11 @@ //! Low-level Rust lexer. //! +//! The idea with `librustc_lexer` is to make a reusable library, +//! by separating out pure lexing and rustc-specific concerns, like spans, +//! error reporting an interning. So, rustc_lexer operates directly on `&str`, +//! produces simple tokens which are a pair of type-tag and a bit of original text, +//! and does not report errors, instead storing them as flags on the token. +//! //! Tokens produced by this lexer are not yet ready for parsing the Rust syntax, //! for that see `librustc_parse::lexer`, which converts this basic token stream //! into wide tokens used by actual parser. @@ -719,6 +725,9 @@ impl Cursor<'_> { // Check that amount of closing '#' symbols // is equal to the amount of opening ones. + // Note that this will not consume extra trailing `#` characters: + // `r###"abcde"####` is lexed as a `LexedRawString { n_hashes: 3 }` + // followed by a `#` token. let mut hashes_left = n_start_hashes; let is_closing_hash = |c| { if c == '#' && hashes_left != 0 { @@ -739,8 +748,8 @@ impl Cursor<'_> { possible_terminator_offset: None, }; } else if n_end_hashes > max_hashes { - // Keep track of possible terminators to give a hint about where there might be - // a missing terminator + // Keep track of possible terminators to give a hint about + // where there might be a missing terminator possible_terminator_offset = Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len); max_hashes = n_end_hashes; diff --git a/src/librustc_lexer/src/unescape.rs b/src/librustc_lexer/src/unescape.rs index c51bf735fa727..2a9e1b7cbc346 100644 --- a/src/librustc_lexer/src/unescape.rs +++ b/src/librustc_lexer/src/unescape.rs @@ -58,69 +58,57 @@ pub enum EscapeError { NonAsciiCharInByteString, } -/// Takes a contents of a char literal (without quotes), and returns an -/// unescaped char or an error -pub fn unescape_char(literal_text: &str) -> Result { - let mut chars = literal_text.chars(); - unescape_char_or_byte(&mut chars, Mode::Char) - .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) -} - -/// Takes a contents of a byte literal (without quotes), and returns an -/// unescaped byte or an error. -pub fn unescape_byte(literal_text: &str) -> Result { - let mut chars = literal_text.chars(); - unescape_char_or_byte(&mut chars, Mode::Byte) - .map(byte_from_char) - .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) -} - -/// Takes a contents of a string literal (without quotes) and produces a +/// Takes a contents of a literal (without quotes) and produces a /// sequence of escaped characters or errors. /// Values are returned through invoking of the provided callback. -pub fn unescape_str(literal_text: &str, callback: &mut F) +pub fn unescape_literal(literal_text: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { - unescape_str_or_byte_str(literal_text, Mode::Str, callback) + match mode { + Mode::Char | Mode::Byte => { + let mut chars = literal_text.chars(); + let result = unescape_char_or_byte(&mut chars, mode); + // The Chars iterator moved forward. + callback(0..(literal_text.len() - chars.as_str().len()), result); + } + Mode::Str | Mode::ByteStr => unescape_str_or_byte_str(literal_text, mode, callback), + // NOTE: Raw strings do not perform any explicit character escaping, here we + // only translate CRLF to LF and produce errors on bare CR. + Mode::RawStr | Mode::RawByteStr => { + unescape_raw_str_or_byte_str(literal_text, mode, callback) + } + } } -/// Takes a contents of a byte string literal (without quotes) and produces a -/// sequence of bytes or errors. +/// Takes a contents of a byte, byte string or raw byte string (without quotes) +/// and produces a sequence of bytes or errors. /// Values are returned through invoking of the provided callback. -pub fn unescape_byte_str(literal_text: &str, callback: &mut F) +pub fn unescape_byte_literal(literal_text: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { - unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { - callback(range, char.map(byte_from_char)) + assert!(mode.is_bytes()); + unescape_literal(literal_text, mode, &mut |range, result| { + callback(range, result.map(byte_from_char)); }) } -/// Takes a contents of a raw string literal (without quotes) and produces a -/// sequence of characters or errors. -/// Values are returned through invoking of the provided callback. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only translate CRLF to LF and produce errors on bare CR. -pub fn unescape_raw_str(literal_text: &str, callback: &mut F) -where - F: FnMut(Range, Result), -{ - unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback) +/// Takes a contents of a char literal (without quotes), and returns an +/// unescaped char or an error +pub fn unescape_char(literal_text: &str) -> Result { + let mut chars = literal_text.chars(); + unescape_char_or_byte(&mut chars, Mode::Char) + .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) } -/// Takes a contents of a raw byte string literal (without quotes) and produces a -/// sequence of bytes or errors. -/// Values are returned through invoking of the provided callback. -/// NOTE: Raw strings do not perform any explicit character escaping, here we -/// only translate CRLF to LF and produce errors on bare CR. -pub fn unescape_raw_byte_str(literal_text: &str, callback: &mut F) -where - F: FnMut(Range, Result), -{ - unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { - callback(range, char.map(byte_from_char)) - }) +/// Takes a contents of a byte literal (without quotes), and returns an +/// unescaped byte or an error. +pub fn unescape_byte(literal_text: &str) -> Result { + let mut chars = literal_text.chars(); + unescape_char_or_byte(&mut chars, Mode::Byte) + .map(byte_from_char) + .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) } /// What kind of literal do we parse. @@ -130,13 +118,15 @@ pub enum Mode { Str, Byte, ByteStr, + RawStr, + RawByteStr, } impl Mode { pub fn in_single_quotes(self) -> bool { match self { Mode::Char | Mode::Byte => true, - Mode::Str | Mode::ByteStr => false, + Mode::Str | Mode::ByteStr | Mode::RawStr | Mode::RawByteStr => false, } } @@ -146,8 +136,8 @@ impl Mode { pub fn is_bytes(self) -> bool { match self { - Mode::Byte | Mode::ByteStr => true, - Mode::Char | Mode::Str => false, + Mode::Byte | Mode::ByteStr | Mode::RawByteStr => true, + Mode::Char | Mode::Str | Mode::RawStr => false, } } } @@ -345,7 +335,7 @@ where fn byte_from_char(c: char) -> u8 { let res = c as u32; - assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte(Str)"); + assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::ByteStr"); res as u8 } diff --git a/src/librustc_lexer/src/unescape/tests.rs b/src/librustc_lexer/src/unescape/tests.rs index e7b1ff6479d88..f2b751a78f27f 100644 --- a/src/librustc_lexer/src/unescape/tests.rs +++ b/src/librustc_lexer/src/unescape/tests.rs @@ -102,7 +102,7 @@ fn test_unescape_char_good() { fn test_unescape_str_good() { fn check(literal_text: &str, expected: &str) { let mut buf = Ok(String::with_capacity(literal_text.len())); - unescape_str(literal_text, &mut |range, c| { + unescape_literal(literal_text, Mode::Str, &mut |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(c), @@ -222,7 +222,7 @@ fn test_unescape_byte_good() { fn test_unescape_byte_str_good() { fn check(literal_text: &str, expected: &[u8]) { let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_byte_str(literal_text, &mut |range, c| { + unescape_byte_literal(literal_text, Mode::ByteStr, &mut |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(c), @@ -246,7 +246,7 @@ fn test_unescape_byte_str_good() { fn test_unescape_raw_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_raw_str(literal, &mut |range, res| unescaped.push((range, res))); + unescape_literal(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -258,7 +258,9 @@ fn test_unescape_raw_str() { fn test_unescape_raw_byte_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_raw_byte_str(literal, &mut |range, res| unescaped.push((range, res))); + unescape_byte_literal(literal, Mode::RawByteStr, &mut |range, res| { + unescaped.push((range, res)) + }); assert_eq!(unescaped, expected); } diff --git a/src/librustc_parse/lexer/mod.rs b/src/librustc_parse/lexer/mod.rs index aa048d682c2da..2b7d5e5adb432 100644 --- a/src/librustc_parse/lexer/mod.rs +++ b/src/librustc_parse/lexer/mod.rs @@ -15,6 +15,7 @@ mod tokentrees; mod unescape_error_reporting; mod unicode_chars; +use rustc_lexer::unescape::Mode; use unescape_error_reporting::{emit_unescape_error, push_escaped_char}; #[derive(Clone, Debug)] @@ -325,38 +326,27 @@ impl<'a> StringReader<'a> { suffix_start: BytePos, kind: rustc_lexer::LiteralKind, ) -> (token::LitKind, Symbol) { - match kind { + // prefix means `"` or `br"` or `r###"`, ... + let (lit_kind, mode, prefix_len, postfix_len) = match kind { rustc_lexer::LiteralKind::Char { terminated } => { if !terminated { self.fatal_span_(start, suffix_start, "unterminated character literal").raise() } - let content_start = start + BytePos(1); - let content_end = suffix_start - BytePos(1); - self.validate_char_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::Char, id) + (token::Char, Mode::Char, 1, 1) // ' ' } rustc_lexer::LiteralKind::Byte { terminated } => { if !terminated { self.fatal_span_(start + BytePos(1), suffix_start, "unterminated byte constant") .raise() } - let content_start = start + BytePos(2); - let content_end = suffix_start - BytePos(1); - self.validate_byte_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::Byte, id) + (token::Byte, Mode::Byte, 2, 1) // b' ' } rustc_lexer::LiteralKind::Str { terminated } => { if !terminated { self.fatal_span_(start, suffix_start, "unterminated double quote string") .raise() } - let content_start = start + BytePos(1); - let content_end = suffix_start - BytePos(1); - self.validate_str_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::Str, id) + (token::Str, Mode::Str, 1, 1) // " " } rustc_lexer::LiteralKind::ByteStr { terminated } => { if !terminated { @@ -367,42 +357,28 @@ impl<'a> StringReader<'a> { ) .raise() } - let content_start = start + BytePos(2); - let content_end = suffix_start - BytePos(1); - self.validate_byte_str_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::ByteStr, id) + (token::ByteStr, Mode::ByteStr, 2, 1) // b" " } rustc_lexer::LiteralKind::RawStr(unvalidated_raw_str) => { let valid_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str); let n_hashes = valid_raw_str.num_hashes(); let n = u32::from(n_hashes); - - let content_start = start + BytePos(2 + n); - let content_end = suffix_start - BytePos(1 + n); - self.validate_raw_str_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::StrRaw(n_hashes), id) + (token::StrRaw(n_hashes), Mode::RawStr, 2 + n, 1 + n) // r##" "## } rustc_lexer::LiteralKind::RawByteStr(unvalidated_raw_str) => { let validated_raw_str = self.validate_and_report_errors(start, unvalidated_raw_str); let n_hashes = validated_raw_str.num_hashes(); let n = u32::from(n_hashes); - - let content_start = start + BytePos(3 + n); - let content_end = suffix_start - BytePos(1 + n); - self.validate_raw_byte_str_escape(content_start, content_end); - let id = self.symbol_from_to(content_start, content_end); - (token::ByteStrRaw(n_hashes), id) + (token::ByteStrRaw(n_hashes), Mode::RawByteStr, 3 + n, 1 + n) // br##" "## } rustc_lexer::LiteralKind::Int { base, empty_int } => { - if empty_int { + return if empty_int { self.err_span_(start, suffix_start, "no valid digits found for number"); (token::Integer, sym::integer(0)) } else { self.validate_int_literal(base, start, suffix_start); (token::Integer, self.symbol_from_to(start, suffix_start)) - } + }; } rustc_lexer::LiteralKind::Float { base, empty_exponent } => { if empty_exponent { @@ -430,9 +406,14 @@ impl<'a> StringReader<'a> { } let id = self.symbol_from_to(start, suffix_start); - (token::Float, id) + return (token::Float, id); } - } + }; + let content_start = start + BytePos(prefix_len); + let content_end = suffix_start - BytePos(postfix_len); + let id = self.symbol_from_to(content_start, content_end); + self.validate_literal_escape(mode, content_start, content_end); + return (lit_kind, id); } pub fn pos(&self) -> BytePos { @@ -558,96 +539,23 @@ impl<'a> StringReader<'a> { .raise(); } - fn validate_char_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - if let Err((off, err)) = unescape::unescape_char(lit) { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::Char, - 0..off, - err, - ) - } - } - - fn validate_byte_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - if let Err((off, err)) = unescape::unescape_byte(lit) { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::Byte, - 0..off, - err, - ) - } - } - - fn validate_str_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - unescape::unescape_str(lit, &mut |range, c| { - if let Err(err) = c { + fn validate_literal_escape(&self, mode: Mode, content_start: BytePos, content_end: BytePos) { + let lit_content = self.str_from_to(content_start, content_end); + unescape::unescape_literal(lit_content, mode, &mut |range, result| { + // Here we only check for errors. The actual unescaping is done later. + if let Err(err) = result { + let span_with_quotes = + self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)); emit_unescape_error( &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::Str, + lit_content, + span_with_quotes, + mode, range, err, - ) - } - }) - } - - fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - unescape::unescape_raw_str(lit, &mut |range, c| { - if let Err(err) = c { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::Str, - range, - err, - ) - } - }) - } - - fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - unescape::unescape_raw_byte_str(lit, &mut |range, c| { - if let Err(err) = c { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::ByteStr, - range, - err, - ) - } - }) - } - - fn validate_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) { - let lit = self.str_from_to(content_start, content_end); - unescape::unescape_byte_str(lit, &mut |range, c| { - if let Err(err) = c { - emit_unescape_error( - &self.sess.span_diagnostic, - lit, - self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)), - unescape::Mode::ByteStr, - range, - err, - ) + ); } - }) + }); } fn validate_int_literal(&self, base: Base, content_start: BytePos, content_end: BytePos) { diff --git a/src/tools/clippy/clippy_lints/src/write.rs b/src/tools/clippy/clippy_lints/src/write.rs index 5ad43ad55a36a..26bf463bd2922 100644 --- a/src/tools/clippy/clippy_lints/src/write.rs +++ b/src/tools/clippy/clippy_lints/src/write.rs @@ -483,8 +483,8 @@ fn check_newlines(fmtstr: &StrLit) -> bool { }; match fmtstr.style { - StrStyle::Cooked => unescape::unescape_str(contents, &mut cb), - StrStyle::Raw(_) => unescape::unescape_raw_str(contents, &mut cb), + StrStyle::Cooked => unescape::unescape_literal(contents, unescape::Mode::Str, &mut cb), + StrStyle::Raw(_) => unescape::unescape_literal(contents, unescape::Mode::RawStr, &mut cb), } should_lint