Skip to content

tr: implement support for non-UTF-8 input #354

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion text/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ libc.workspace = true
notify-debouncer-full = "0.3"
diff = "0.1"
dirs = "5.0"
deunicode = "1.6"
walkdir = "2"

[dev-dependencies]
Expand Down
172 changes: 162 additions & 10 deletions text/tests/tr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,37 @@
// SPDX-License-Identifier: MIT
//

use plib::testing::{run_test, TestPlan};
use plib::testing::{run_test, run_test_u8, TestPlan, TestPlanU8};

// Byte-oriented test helper: builds a `TestPlanU8` that invokes `tr` with
// `args`, supplies `test_data` as stdin, and lists `expected_output` as the
// expected output with empty expected error output and exit code 0, then
// hands the plan to `run_test_u8`. Byte slices are used so that data which
// is not valid UTF-8 can be passed through.
fn tr_test_binary(args: &[&str], test_data: &[u8], expected_output: &[u8]) {
    let plan = TestPlanU8 {
        cmd: String::from("tr"),
        args: args
            .iter()
            .map(|&arg| String::from(arg))
            .collect::<Vec<String>>(),
        stdin_data: test_data.to_vec(),
        expected_out: expected_output.to_vec(),
        expected_err: Vec::<u8>::new(),
        expected_exit_code: 0_i32,
    };

    run_test_u8(plan);
}

fn tr_test(args: &[&str], test_data: &str, expected_output: &str) {
let str_args = args
.iter()
.map(|st| st.to_owned().to_owned())
.collect::<Vec<String>>();
.collect::<Vec<_>>();

run_test(TestPlan {
cmd: "tr".to_owned(),
args: str_args,
stdin_data: test_data.to_owned(),
expected_out: expected_output.to_owned(),
expected_err: String::new(),
expected_exit_code: 0,
expected_exit_code: 0_i32,
});
}

Expand All @@ -38,7 +54,7 @@ fn tr_bad_arguments_failure_test(args: &[&str], expected_stderr: &str) {
stdin_data: String::new(),
expected_out: String::new(),
expected_err: expected_stderr.to_owned(),
expected_exit_code: 1,
expected_exit_code: 1_i32,
});
}

Expand Down Expand Up @@ -482,23 +498,23 @@ fn tr_bad_octal_range() {
// A decimal repeat count far too large to represent must be rejected, and the
// offending count is quoted with plain ASCII apostrophes in the diagnostic.
fn tr_bad_x_n_construct_decimal() {
    tr_bad_arguments_failure_test(
        &["-d", "[a*100000000000000000000]"],
        "tr: invalid repeat count '100000000000000000000' in [c*n] construct\n",
    );
}

// A repeat count with a leading zero (octal form) that is far too large must
// be rejected, with the count quoted in plain ASCII apostrophes.
#[test]
fn tr_bad_x_n_construct_octal() {
    tr_bad_arguments_failure_test(
        &["-d", "[a*010000000000000000000000]"],
        "tr: invalid repeat count '010000000000000000000000' in [c*n] construct\n",
    );
}

// A repeat count that is not a number at all ("a") must be rejected, with the
// count quoted in plain ASCII apostrophes rather than typographic quotes.
#[test]
fn tr_bad_x_n_construct_non_decimal_non_octal() {
    tr_bad_arguments_failure_test(
        &["-d", "[a*a]"],
        "tr: invalid repeat count 'a' in [c*n] construct\n",
    );
}

Expand Down Expand Up @@ -597,7 +613,7 @@ fn tr_equivalence_class_low_priority() {
// Translating with only one string operand must fail; the diagnostic quotes
// the operand with plain ASCII apostrophes rather than typographic quotes.
fn tr_arguments_validation_error_message_format() {
    tr_bad_arguments_failure_test(
        &["a"],
        "tr: missing operand after 'a'. Two strings must be given when translating.\n",
    );
}

Expand Down Expand Up @@ -633,11 +649,147 @@ fn tr_minimal_d_s() {
// An equivalence class with no character between the `=` signs must be
// rejected; the diagnostic names the whole offending input first.
fn tr_missing_equiv() {
    tr_bad_arguments_failure_test(
        &["-d", "[==]"],
        "tr: input '[==]' is invalid: missing equivalence class character\n",
    );
}

// A character class with no name between the colons must be rejected; the
// diagnostic names the whole offending input first.
#[test]
fn tr_missing_character_class() {
    tr_bad_arguments_failure_test(
        &["-d", "[::]"],
        "tr: input '[::]' is invalid: missing character class name\n",
    );
}

// The octal escape `\377` in string1 must match the single raw byte 0xFF,
// which is not valid UTF-8 on its own, and translate it to "A".
#[test]
fn tr_8_bit() {
    let args = [r"\377", "A"];

    tr_test_binary(&args, b"\xFF", b"A");
}

// Deleting two multi-byte UTF-8 characters removes every occurrence of them
// and leaves the other multi-byte characters intact.
#[test]
fn tr_multi_byte_utf_8() {
    let args = ["-d", "ᛆᚠ"];
    let input = "ᛆᚠᛏᚢᛆᛘᚢᚦᛌᛏᚭᚿᛏᛆᚱᚢᚿᛆᛧᚦᛆᛧ";
    let expected = "ᛏᚢᛘᚢᚦᛌᛏᚭᚿᛏᚱᚢᚿᛧᚦᛧ";

    tr_test(&args, input, expected);
}

// With `-c -d -s`, everything except "D" is deleted, and the remaining run
// of "D" is squeezed to one: the squeeze set (string2) is not complemented.
#[test]
fn tr_c_d_s_squeeze_not_complemented() {
    let args = ["-c", "-d", "-s", "D", "D"];
    let input = "DDD AAABBBCCC DDD";

    tr_test(&args, input, "D");
}

// With `-s` and two strings, "1" is translated to "2" and repeats of the
// string2 characters ("2" and "3") are squeezed in the translated output.
#[test]
fn tr_squeeze_independent_of_translation() {
    let args = ["-s", "1", "23"];
    let input = "111 222 333";
    let expected = "2 2 3";

    tr_test(&args, input, expected);
}

// With `-c -s`, the "1" characters pass through untouched while everything
// else in this input ends up as a single squeezed "3".
#[test]
fn tr_complemented_squeeze_independent_of_translation() {
    let args = ["-c", "-s", "1", "23"];
    let input = "111 222 333";
    let expected = "1113";

    tr_test(&args, input, expected);
}

// `[d*]` in string2 supplies "d" for every character outside "B"; the
// resulting runs of "d" are then squeezed, while "BBB" is left as is.
#[test]
fn tr_c_s_as_many_as_needed() {
    let args = ["-c", "-s", "B", "[d*]"];
    let input = "AAA BBB CCC";
    let expected = "dBBBd";

    tr_test(&args, input, expected);
}

// Only BusyBox and uutils' coreutils handle this "correctly"
// bsdutils runs forever (or a very long time)
// GNU Core Utilities rejects this because of the "[C*]" argument
//
// "B" is deleted and repeats of "C" are squeezed. Removing "BBB" leaves the
// two spaces that surrounded it next to each other; space is not in the
// squeeze set, so both spaces must survive in the expected output.
#[test]
fn tr_non_standard_d_s() {
    tr_test(&["-d", "-s", "B", "[C*]"], "AAA BBB CCC DDD", "AAA  C DDD");
}

// Different from bsdutils, but bsdutils doesn't handle 8-bit non-UTF-8 data
//
// The two characters other than "ᛏ" come out as six "A"s in total, while
// "ᛏ" itself is left untouched.
#[test]
fn tr_multi_byte_complement_translation() {
    let args = ["-c", "ᛏ", "A"];
    let input = "ᛆᚠᛏ";
    let expected = "AAAAAAᛏ";

    tr_test(&args, input, expected);
}

// A single-byte character that follows a multi-byte one must still be
// translated: "A" becomes "B" while the leading "ᛏ" is preserved.
#[test]
fn tr_multi_byte_indexing_check() {
    let args = ["-c", "ᛏ", "B"];
    let input = "ᛏA";
    let expected = "ᛏB";

    tr_test(&args, input, expected);
}

// BusyBox does not parse backslash escape sequences inside [x*n] constructs
// Other implementations do
//
// "b" and "c" are both replaced by a newline. The expected output is written
// with explicit `\n` escapes so that the spaces that precede a replaced "b"
// ("The ", "ig ") stay visible and cannot be lost to whitespace trimming.
#[test]
fn tr_slash_n_as_many_as_needed() {
    tr_test(
        &["b-c", r"[\n*]"],
        "The big black fox jumped over the fence",
        "The \nig \nla\nk fox jumped over the fen\ne",
    );
}

// `[\nZ]` is not a valid [x*n] construct, so, as the expected output shows,
// its four characters are used literally: "b" -> "[", "c" -> newline,
// "d" -> "Z" and "e" -> "]".
#[test]
fn tr_slash_n_broken_x_n_construct() {
    let args = ["b-e", r"[\nZ]"];
    let input = "The big black fox jumped over the fence";
    let expected = "\
Th] [ig [la
k fox jump]Z ov]r th] f]n
]";

    tr_test(&args, input, expected);
}

// An octal escape inside an [x*n] construct: `[\123*2]` yields "S" twice
// (for "a" and "b"), and the trailing "9" covers "c" and "d".
#[test]
fn tr_octal_in_as_many_as_needed() {
    let args = ["a-d", r"[\123*2]9"];
    let input = "The big black fox jumped over the fence";
    let expected = "The Sig SlS9k fox jumpe9 over the fen9e";

    tr_test(&args, input, expected);
}

// "8" is not an octal digit, so, judging by the expected output, `\128` is
// read as the octal escape `\12` (newline) followed by a literal "8", and the
// bracket expression is used character by character: "a" -> "[",
// "b" -> newline, "c" -> "8" and "d" -> "*".
//
// The expected output is written with explicit `\n` escapes so that the
// spaces preceding a replaced "b" ("The ", "ig ") stay visible and cannot be
// lost to whitespace trimming.
#[test]
fn tr_invalid_octal_in_as_many_as_needed() {
    tr_test(
        &["a-d", r"[\128*2]"],
        "The big black fox jumped over the fence",
        "The \nig \nl[8k fox jumpe* over the fen8e",
    );
}

// A range whose start (U+16C6) sorts after its end (U+16A6) must be
// rejected; the diagnostic prints both endpoints in `\u{...}` form.
#[test]
fn tr_invalid_multi_byte_range() {
    let args = ["-d", "ᛆ-ᚦ"];
    let expected_stderr = r"tr: range-endpoints of '\u{16c6}-\u{16a6}' are in reverse collating sequence order
";

    tr_bad_arguments_failure_test(&args, expected_stderr);
}

// Deleting a range with multi-byte endpoints removes every character that
// falls inside the range and keeps the ASCII text and the characters outside
// the range.
#[test]
fn tr_multi_byte_range() {
    let args = ["-d", "ᚢ-ᛆ"];
    let input = "A ᛆᚠᛏᚢᛆᛘᚢᚦᛌᛏᚭᚿᛏᛆᚱᚢᚿᛆᛧᚦᛆᛧ B";
    let expected = "\
A ᚠᛏᛘᛌᛏᛏᛧᛧ B";

    tr_test(&args, input, expected);
}

// A run of one multi-byte character is translated to "A" and then squeezed
// down to a single "A".
#[test]
fn tr_multi_byte_squeeze_translate() {
    let args = ["-s", "ᚢ", "A"];
    let input = "123 ᚢᚢᚢᚢᚢᚢ 456";
    let expected = "123 A 456";

    tr_test(&args, input, expected);
}

// `-d` without `-s` accepts only one string; a second operand must be
// reported as an extra operand together with an explanatory second line.
#[test]
fn tr_dash_d_two_strings() {
    let args = ["-d", "A", "B"];
    let expected_stderr = "\
tr: extra operand 'B'
Only one string may be given when deleting without squeezing repeats.
";

    tr_bad_arguments_failure_test(&args, expected_stderr);
}
Loading