Skip to content

Commit b77d1d0

Browse files
committed
Stop crashing on invalid Unicode input
Unlike C++, Rust requires "char" to be a valid Unicode code point. As a workaround, we take the raw (probably UTF-8-encoded) input and convert each input byte to a char representation from the private use area (see commit 3b15e99 (str2wcs: encode invalid Unicode characters in the private use area, 2023-04-01)). We convert back whenever we output the string, which is correct as long as the encoding didn't change since the data was input. We also need to convert keyboard input; do that. Quick testing shows that our reader drops PUA characters. Since this patch converts both invalid Unicode input as well as PUA input into a safe PUA representation, there's no longer a reason to not add PUA characters to the commandline, so let's do that to restore traditional behavior. Render them as � (REPLACEMENT CHARACTER); unfortunately we show one per input byte instead of one per code point. To fix this we probably need our own char type. While at it, remove some special cases that try to prevent insertion of control characters. I don't think they are necessary. Could be wrong..
1 parent f60b6e6 commit b77d1d0

File tree

3 files changed

+32
-28
lines changed

3 files changed

+32
-28
lines changed

src/input_common.rs

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
1-
use crate::common::{is_windows_subsystem_for_linux, read_blocked};
1+
use crate::common::{fish_reserved_codepoint, is_windows_subsystem_for_linux, read_blocked};
22
use crate::env::{EnvStack, Environment};
33
use crate::fd_readable_set::FdReadableSet;
44
use crate::flog::FLOG;
55
use crate::reader::reader_current_data;
66
use crate::threads::{iothread_port, iothread_service_main};
77
use crate::universal_notifier::default_notifier;
8-
use crate::wchar::prelude::*;
8+
use crate::wchar::{encode_byte_to_char, prelude::*};
99
use crate::wutil::encoding::{mbrtowc, zero_mbstate};
1010
use crate::wutil::fish_wcstol;
1111
use std::collections::VecDeque;
@@ -374,6 +374,8 @@ pub trait InputEventQueuer {
374374
fn readch(&mut self) -> CharEvent {
375375
let mut res: char = '\0';
376376
let mut state = zero_mbstate();
377+
let mut bytes = [0; 64 * 16];
378+
let mut num_bytes = 0;
377379
loop {
378380
// Do we have something enqueued already?
379381
// Note this may be initially true, or it may become true through calls to
@@ -415,9 +417,10 @@ pub trait InputEventQueuer {
415417
res = read_byte.into();
416418
return CharEvent::from_char(res);
417419
}
420+
let mut codepoint = u32::from(res);
418421
let sz = unsafe {
419422
mbrtowc(
420-
std::ptr::addr_of_mut!(res).cast(),
423+
std::ptr::addr_of_mut!(codepoint).cast(),
421424
std::ptr::addr_of!(read_byte).cast(),
422425
1,
423426
&mut state,
@@ -430,16 +433,30 @@ pub trait InputEventQueuer {
430433
}
431434
-2 => {
432435
// Sequence not yet complete.
436+
bytes[num_bytes] = read_byte;
437+
num_bytes += 1;
438+
continue;
433439
}
434440
0 => {
435441
// Actual nul char.
436442
return CharEvent::from_char('\0');
437443
}
438-
_ => {
439-
// Sequence complete.
444+
_ => (),
445+
}
446+
if let Some(res) = char::from_u32(codepoint) {
447+
// Sequence complete.
448+
if !fish_reserved_codepoint(res) {
440449
return CharEvent::from_char(res);
441450
}
442451
}
452+
bytes[num_bytes] = read_byte;
453+
num_bytes += 1;
454+
for &b in &bytes[1..num_bytes] {
455+
let c = CharEvent::from_char(encode_byte_to_char(b));
456+
self.push_back(c);
457+
}
458+
let res = CharEvent::from_char(encode_byte_to_char(bytes[0]));
459+
return res;
443460
}
444461
}
445462
}

src/reader.rs

Lines changed: 6 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,10 @@ use crate::ast::{self, Ast, Category, Traversal};
3939
use crate::builtins::shared::STATUS_CMD_OK;
4040
use crate::color::RgbColor;
4141
use crate::common::{
42-
escape, escape_string, exit_without_destructors, fish_reserved_codepoint, get_ellipsis_char,
43-
get_obfuscation_read_char, redirect_tty_output, scoped_push_replacer, scoped_push_replacer_ctx,
44-
shell_modes, str2wcstring, wcs2string, write_loop, EscapeFlags, EscapeStringStyle, ScopeGuard,
45-
PROGRAM_NAME, UTF8_BOM_WCHAR,
42+
escape, escape_string, exit_without_destructors, get_ellipsis_char, get_obfuscation_read_char,
43+
redirect_tty_output, scoped_push_replacer, scoped_push_replacer_ctx, shell_modes, str2wcstring,
44+
wcs2string, write_loop, EscapeFlags, EscapeStringStyle, ScopeGuard, PROGRAM_NAME,
45+
UTF8_BOM_WCHAR,
4646
};
4747
use crate::complete::{
4848
complete, complete_load, sort_and_prioritize, CompleteFlags, Completion, CompletionList,
@@ -1844,10 +1844,7 @@ impl ReaderData {
18441844
&& zelf.active_edit_line().1.position() == 0
18451845
{
18461846
// This character is skipped.
1847-
} else if !fish_reserved_codepoint(c)
1848-
&& (c >= ' ' || c == '\n' || c == '\r')
1849-
&& c != '\x7F'
1850-
{
1847+
} else {
18511848
// Regular character.
18521849
let (elt, _el) = zelf.active_edit_line();
18531850
zelf.insert_char(elt, c);
@@ -1857,10 +1854,6 @@ impl ReaderData {
18571854
// We end history search. We could instead update the search string.
18581855
zelf.history_search.reset();
18591856
}
1860-
} else {
1861-
// This can happen if the user presses a control char we don't recognize. No
1862-
// reason to report this to the user unless they've enabled debugging output.
1863-
FLOG!(reader, wgettext_fmt!("Unknown key binding 0x%X", c));
18641857
}
18651858
rls.last_cmd = None;
18661859
}
@@ -1976,7 +1969,7 @@ impl ReaderData {
19761969
self.inputter
19771970
.read_char(allow_commands.then_some(&mut command_handler))
19781971
};
1979-
if !event_is_normal_char(&evt) || !poll_fd_readable(self.conf.inputfd) {
1972+
if !evt.is_char() || !poll_fd_readable(self.conf.inputfd) {
19801973
event_needing_handling = Some(evt);
19811974
break;
19821975
} else if evt.input_style == CharInputStyle::NotFirst
@@ -5334,12 +5327,3 @@ impl ReaderData {
53345327
self.set_buffer_maintaining_pager(&new_command_line, cursor, false);
53355328
}
53365329
}
5337-
5338-
/// \return true if an event is a normal character that should be inserted into the buffer.
5339-
fn event_is_normal_char(evt: &CharEvent) -> bool {
5340-
if !evt.is_char() {
5341-
return false;
5342-
}
5343-
let c = evt.get_char();
5344-
!fish_reserved_codepoint(c) && c > char::from(31) && c != char::from(127)
5345-
}

src/screen.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use std::sync::Mutex;
1717
use libc::{ONLCR, STDERR_FILENO, STDOUT_FILENO};
1818

1919
use crate::common::{
20-
get_ellipsis_char, get_omitted_newline_str, get_omitted_newline_width,
20+
fish_reserved_codepoint, get_ellipsis_char, get_omitted_newline_str, get_omitted_newline_width,
2121
has_working_tty_timestamps, shell_modes, str2wcstring, wcs2string, write_loop, ScopeGuard,
2222
ScopeGuarding,
2323
};
@@ -1825,6 +1825,9 @@ fn compute_layout(
18251825
// \n.
18261826
// See https://unicode-table.com/en/blocks/control-pictures/
18271827
fn rendered_character(c: char) -> char {
1828+
if fish_reserved_codepoint(c) {
1829+
return '�'; // replacement character
1830+
}
18281831
if c <= '\x1F' {
18291832
char::from_u32(u32::from(c) + 0x2400).unwrap()
18301833
} else {

0 commit comments

Comments
 (0)