Skip to content

Commit

Permalink
Make unicode text flow control chars visible as �
Browse files Browse the repository at this point in the history
We already point these out quite aggressively, telling people not to use them, but they would normally be rendered as nothing. Having them visible will make it easier for people to actually deal with them.

```
error: unicode codepoint changing visible direction of text present in literal
  --> $DIR/unicode-control-codepoints.rs:26:22
   |
LL |     println!("{:?}", '�');
   |                      ^-^
   |                      ||
   |                      |'\u{202e}'
   |                      this literal contains an invisible unicode text flow control codepoint
   |
   = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
   = help: if their presence wasn't intentional, you can remove them
help: if you want to keep them but make them visible in your source code, you can escape them
   |
LL |     println!("{:?}", '\u{202e}');
   |                       ~~~~~~~~
```

vs the previous

```
error: unicode codepoint changing visible direction of text present in literal
  --> $DIR/unicode-control-codepoints.rs:26:22
   |
LL |     println!("{:?}", '');
   |                      ^-
   |                      ||
   |                      |'\u{202e}'
   |                      this literal contains an invisible unicode text flow control codepoint
   |
   = note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
   = help: if their presence wasn't intentional, you can remove them
help: if you want to keep them but make them visible in your source code, you can escape them
   |
LL |     println!("{:?}", '\u{202e}');
   |                       ~~~~~~~~
```
  • Loading branch information
estebank committed Jul 18, 2024
1 parent 2d7795d commit 9dffe95
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 60 deletions.
21 changes: 11 additions & 10 deletions compiler/rustc_errors/src/emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2558,18 +2558,19 @@ fn num_decimal_digits(num: usize) -> usize {
}

// We replace some characters so the CLI output is always consistent and underlines aligned.
// Keep the following list in sync with `rustc_span::char_width`.
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
('\t', " "), // We do our own tab replacement
('\t', " "), // We do our own tab replacement
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', ""),
('\u{2066}', ""),
('\u{2067}', ""),
('\u{2068}', ""),
('\u{202C}', ""),
('\u{2069}', ""),
('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently
('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk
('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always.
('\u{202E}', "�"),
('\u{2066}', "�"),
('\u{2067}', "�"),
('\u{2068}', "�"),
('\u{202C}', "�"),
('\u{2069}', "�"),
// In terminals without Unicode support the following will be garbled, but in *all* terminals
// the underlying codepoint will be as well. We could gate this replacement behind a "unicode
// support" gate.
Expand Down
3 changes: 2 additions & 1 deletion compiler/rustc_span/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2093,7 +2093,8 @@ pub fn char_width(ch: char) -> usize {
| '\u{000E}' | '\u{000F}' | '\u{0010}' | '\u{0011}' | '\u{0012}' | '\u{0013}'
| '\u{0014}' | '\u{0015}' | '\u{0016}' | '\u{0017}' | '\u{0018}' | '\u{0019}'
| '\u{001A}' | '\u{001B}' | '\u{001C}' | '\u{001D}' | '\u{001E}' | '\u{001F}'
| '\u{007F}' => 1,
| '\u{007F}' | '\u{202A}' | '\u{202B}' | '\u{202D}' | '\u{202E}' | '\u{2066}'
| '\u{2067}' | '\u{2068}' | '\u{202C}' | '\u{2069}' => 1,
_ => unicode_width::UnicodeWidthChar::width(ch).unwrap_or(1),
}
}
Expand Down
98 changes: 49 additions & 49 deletions tests/ui/parser/unicode-control-codepoints.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -17,78 +17,78 @@ LL | println!("{:?}", b"us\u{202B}e\u{202A}r");
error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:26
|
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
LL | println!("{:?}", b"/* } if isAdmin� � begin admins only ");
| ^ must be ASCII but is '\u{202e}'
|
help: if you meant to use the UTF-8 encoding of '\u{202e}', use \xHH escapes
|
LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin begin admins only ");
LL | println!("{:?}", b"/*\xE2\x80\xAE } if isAdmin� � begin admins only ");
| ~~~~~~~~~~~~

error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:30
|
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ must be ASCII but is '\u{2066}'
LL | println!("{:?}", b"/* } if isAdmin� � begin admins only ");
| ^ must be ASCII but is '\u{2066}'
|
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
|
LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin begin admins only ");
| ~~~~~~~~~~~~
LL | println!("{:?}", b"/* } \xE2\x81\xA6if isAdmin� � begin admins only ");
| ~~~~~~~~~~~~

error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:41
|
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ must be ASCII but is '\u{2069}'
LL | println!("{:?}", b"/* } if isAdmin� � begin admins only ");
| ^ must be ASCII but is '\u{2069}'
|
help: if you meant to use the UTF-8 encoding of '\u{2069}', use \xHH escapes
|
LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only ");
| ~~~~~~~~~~~~
LL | println!("{:?}", b"/* } if isAdmin\xE2\x81\xA9 begin admins only ");
| ~~~~~~~~~~~~

error: non-ASCII character in byte string literal
--> $DIR/unicode-control-codepoints.rs:16:43
|
LL | println!("{:?}", b"/* } if isAdmin begin admins only ");
| ^ must be ASCII but is '\u{2066}'
LL | println!("{:?}", b"/* } if isAdmin� � begin admins only ");
| ^ must be ASCII but is '\u{2066}'
|
help: if you meant to use the UTF-8 encoding of '\u{2066}', use \xHH escapes
|
LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
| ~~~~~~~~~~~~
LL | println!("{:?}", b"/* } if isAdmin \xE2\x81\xA6 begin admins only ");
| ~~~~~~~~~~~~

error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:29
|
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
LL | println!("{:?}", br##"/* } if isAdmin� � begin admins only "##);
| ^ must be ASCII but is '\u{202e}'

error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:33
|
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
| ^ must be ASCII but is '\u{2066}'
LL | println!("{:?}", br##"/* } if isAdmin� � begin admins only "##);
| ^ must be ASCII but is '\u{2066}'

error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:44
|
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
| ^ must be ASCII but is '\u{2069}'
LL | println!("{:?}", br##"/* } if isAdmin� � begin admins only "##);
| ^ must be ASCII but is '\u{2069}'

error: non-ASCII character in raw byte string literal
--> $DIR/unicode-control-codepoints.rs:21:46
|
LL | println!("{:?}", br##"/* } if isAdmin begin admins only "##);
| ^ must be ASCII but is '\u{2066}'
LL | println!("{:?}", br##"/* } if isAdmin� � begin admins only "##);
| ^ must be ASCII but is '\u{2066}'

error: unicode codepoint changing visible direction of text present in comment
--> $DIR/unicode-control-codepoints.rs:2:5
|
LL | // if access_level != "user" { // Check if admin
| ^^^^^^^^^^^^^^^^^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
| | ||
| | |'\u{202a}'
LL | // if access_level != "us�e�r" { // Check if admin
| ^^^^^^^^^^^^^^^^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
| | | |
| | | '\u{202a}'
| | '\u{202b}'
| this comment contains invisible unicode text flow control codepoints
|
Expand All @@ -99,12 +99,12 @@ LL | // if access_level != "user" { // Check if admin
error: unicode codepoint changing visible direction of text present in comment
--> $DIR/unicode-control-codepoints.rs:30:1
|
LL | //"/* } if isAdmin begin admins only */"
| ^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
| | | | ||
| | | | |'\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
LL | //"/* } if isAdmin� � begin admins only */"
| ^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
| | | | | |
| | | | | '\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
| | '\u{202e}'
| this comment contains invisible unicode text flow control codepoints
|
Expand All @@ -114,12 +114,12 @@ LL | //"/* } if isAdmin begin admins only */"
error: unicode codepoint changing visible direction of text present in literal
--> $DIR/unicode-control-codepoints.rs:11:22
|
LL | println!("{:?}", "/* } if isAdmin begin admins only ");
| ^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^
| | | | ||
| | | | |'\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
LL | println!("{:?}", "/* } if isAdmin� � begin admins only ");
| ^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^
| | | | | |
| | | | | '\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
| | '\u{202e}'
| this literal contains invisible unicode text flow control codepoints
|
Expand All @@ -134,12 +134,12 @@ LL | println!("{:?}", "/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} begi
error: unicode codepoint changing visible direction of text present in literal
--> $DIR/unicode-control-codepoints.rs:14:22
|
LL | println!("{:?}", r##"/* } if isAdmin begin admins only "##);
| ^^^^^^-^^-^^^^^^^^^--^^^^^^^^^^^^^^^^^^^^^
| | | | ||
| | | | |'\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
LL | println!("{:?}", r##"/* } if isAdmin� � begin admins only "##);
| ^^^^^^-^^^-^^^^^^^^^^-^-^^^^^^^^^^^^^^^^^^^^^^
| | | | | |
| | | | | '\u{2066}'
| | | | '\u{2069}'
| | | '\u{2066}'
| | '\u{202e}'
| this literal contains invisible unicode text flow control codepoints
|
Expand All @@ -153,8 +153,8 @@ LL | println!("{:?}", r##"/*\u{202e} } \u{2066}if isAdmin\u{2069} \u{2066} b
error: unicode codepoint changing visible direction of text present in literal
--> $DIR/unicode-control-codepoints.rs:26:22
|
LL | println!("{:?}", '');
| ^-
LL | println!("{:?}", '�');
| ^-^
| ||
| |'\u{202e}'
| this literal contains an invisible unicode text flow control codepoint
Expand All @@ -169,8 +169,8 @@ LL | println!("{:?}", '\u{202e}');
error: unicode codepoint changing visible direction of text present in doc comment
--> $DIR/unicode-control-codepoints.rs:33:1
|
LL | /** ''); */fn foo() {}
| ^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint
LL | /** ''); */fn foo() {}
| ^^^^^^^^^^^^^ this doc comment contains an invisible unicode text flow control codepoint
|
= note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
= note: if their presence wasn't intentional, you can remove them
Expand All @@ -181,8 +181,8 @@ error: unicode codepoint changing visible direction of text present in doc comme
|
LL | / /**
LL | | *
LL | | * ''); */fn bar() {}
| |___________^ this doc comment contains an invisible unicode text flow control codepoint
LL | | * ''); */fn bar() {}
| |____________^ this doc comment contains an invisible unicode text flow control codepoint
|
= note: these kind of unicode codepoints change the way text flows on applications that support them, but can cause confusion because they change the order of characters on the screen
= note: if their presence wasn't intentional, you can remove them
Expand Down

0 comments on commit 9dffe95

Please sign in to comment.