Skip to content

Commit d33aca1

Browse files
GunterSchmidtGunter Schmidtsylvestre
authored
cmp Feat: change data type for 'bytes' limit and 'ignore initial' to u64 (#183)
* feat: u64 for --bytes and --ignore-initial fix: bumped up tempfile to "3.26.0" The variables for --bytes, --ignore-initial and line count where size 'usize', thus limiting the readable bytes on 32-bit systems. GNU cmp is compiled with LFS (Large File Support) and allows i64 values. This is now all u64, which works also on 32-bit systems with Rust. There is no reason to implement a 32-bit barrier for 32 bit machines. Additionally the --bytes limit can be set to 'u128' using the feature "cmp_bytes_limit_128_bit". The performance impact would be negligible, as there only few calculations each time a full block is read from the file. --------- Co-authored-by: Gunter Schmidt <gsgit@beadsoft.de> Co-authored-by: Sylvestre Ledru <sylvestre@debian.org>
1 parent 6491790 commit d33aca1

1 file changed

Lines changed: 63 additions & 48 deletions

File tree

src/cmp.rs

Lines changed: 63 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,20 @@ use std::os::unix::fs::MetadataExt;
2020
#[cfg(target_os = "windows")]
2121
use std::os::windows::fs::MetadataExt;
2222

23+
/// for --bytes, so really large number limits can be expressed, like 1Y.
24+
pub type BytesLimitU64 = u64;
25+
// ignore initial is currently limited to u64, as take(skip) is used.
26+
pub type SkipU64 = u64;
27+
2328
#[derive(Clone, Debug, Default, Eq, PartialEq)]
2429
pub struct Params {
2530
executable: OsString,
2631
from: OsString,
2732
to: OsString,
2833
print_bytes: bool,
29-
skip_a: Option<usize>,
30-
skip_b: Option<usize>,
31-
max_bytes: Option<usize>,
34+
skip_a: Option<SkipU64>,
35+
skip_b: Option<SkipU64>,
36+
max_bytes: Option<BytesLimitU64>,
3237
verbose: bool,
3338
quiet: bool,
3439
}
@@ -71,13 +76,13 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
7176
};
7277
let executable_str = executable.to_string_lossy().to_string();
7378

74-
let parse_skip = |param: &str, skip_desc: &str| -> Result<usize, String> {
79+
let parse_skip = |param: &str, skip_desc: &str| -> Result<SkipU64, String> {
7580
let suffix_start = param
7681
.find(|b: char| !b.is_ascii_digit())
7782
.unwrap_or(param.len());
78-
let mut num = match param[..suffix_start].parse::<usize>() {
83+
let mut num = match param[..suffix_start].parse::<SkipU64>() {
7984
Ok(num) => num,
80-
Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => usize::MAX,
85+
Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => SkipU64::MAX,
8186
Err(_) => {
8287
return Err(format!(
8388
"{executable_str}: invalid --ignore-initial value '{skip_desc}'"
@@ -88,33 +93,24 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
8893
if suffix_start != param.len() {
8994
// Note that GNU cmp advertises supporting up to Y, but fails if you try
9095
// to actually use anything beyond E.
91-
let multiplier: usize = match &param[suffix_start..] {
96+
let multiplier: SkipU64 = match &param[suffix_start..] {
9297
"kB" => 1_000,
9398
"K" => 1_024,
9499
"MB" => 1_000_000,
95100
"M" => 1_048_576,
96101
"GB" => 1_000_000_000,
97102
"G" => 1_073_741_824,
98-
// This only generates a warning when compiling for target_pointer_width < 64
99-
#[allow(unused_variables)]
100-
suffix @ ("TB" | "T" | "PB" | "P" | "EB" | "E") => {
101-
#[cfg(target_pointer_width = "64")]
102-
match suffix {
103-
"TB" => 1_000_000_000_000,
104-
"T" => 1_099_511_627_776,
105-
"PB" => 1_000_000_000_000_000,
106-
"P" => 1_125_899_906_842_624,
107-
"EB" => 1_000_000_000_000_000_000,
108-
"E" => 1_152_921_504_606_846_976,
109-
_ => unreachable!(),
110-
}
111-
#[cfg(not(target_pointer_width = "64"))]
112-
usize::MAX
113-
}
114-
"ZB" => usize::MAX, // 1_000_000_000_000_000_000_000,
115-
"Z" => usize::MAX, // 1_180_591_620_717_411_303_424,
116-
"YB" => usize::MAX, // 1_000_000_000_000_000_000_000_000,
117-
"Y" => usize::MAX, // 1_208_925_819_614_629_174_706_176,
103+
"TB" => 1_000_000_000_000,
104+
"T" => 1_099_511_627_776,
105+
"PB" => 1_000_000_000_000_000,
106+
"P" => 1_125_899_906_842_624,
107+
"EB" => 1_000_000_000_000_000_000,
108+
"E" => 1_152_921_504_606_846_976,
109+
// TODO setting usize:MAX does not mimic GNU cmp behavior, it should be an error.
110+
"ZB" => SkipU64::MAX, // 1_000_000_000_000_000_000_000,
111+
"Z" => SkipU64::MAX, // 1_180_591_620_717_411_303_424,
112+
"YB" => SkipU64::MAX, // 1_000_000_000_000_000_000_000_000,
113+
"Y" => SkipU64::MAX, // 1_208_925_819_614_629_174_706_176,
118114
_ => {
119115
return Err(format!(
120116
"{executable_str}: invalid --ignore-initial value '{skip_desc}'"
@@ -124,7 +120,7 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
124120

125121
num = match num.overflowing_mul(multiplier) {
126122
(n, false) => n,
127-
_ => usize::MAX,
123+
_ => SkipU64::MAX,
128124
}
129125
}
130126

@@ -178,9 +174,10 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
178174
let (_, arg) = param_str.split_once('=').unwrap();
179175
arg.to_string()
180176
};
181-
let max_bytes = match max_bytes.parse::<usize>() {
177+
let max_bytes = match max_bytes.parse::<BytesLimitU64>() {
182178
Ok(num) => num,
183-
Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => usize::MAX,
179+
// TODO limit to MAX is dangerous, this should become an error like in GNU cmp.
180+
Err(e) if *e.kind() == std::num::IntErrorKind::PosOverflow => BytesLimitU64::MAX,
184181
Err(_) => {
185182
return Err(format!(
186183
"{executable_str}: invalid --bytes value '{max_bytes}'"
@@ -238,7 +235,7 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
238235
}
239236

240237
// Do as GNU cmp, and completely disable printing if we are
241-
// outputing to /dev/null.
238+
// outputting to /dev/null.
242239
#[cfg(not(target_os = "windows"))]
243240
if is_stdout_dev_null() {
244241
params.quiet = true;
@@ -290,7 +287,7 @@ pub fn parse_params<I: Iterator<Item = OsString>>(mut opts: Peekable<I>) -> Resu
290287

291288
fn prepare_reader(
292289
path: &OsString,
293-
skip: &Option<usize>,
290+
skip: &Option<SkipU64>,
294291
params: &Params,
295292
) -> Result<Box<dyn BufRead>, String> {
296293
let mut reader: Box<dyn BufRead> = if path == "-" {
@@ -309,7 +306,8 @@ fn prepare_reader(
309306
};
310307

311308
if let Some(skip) = skip {
312-
if let Err(e) = io::copy(&mut reader.by_ref().take(*skip as u64), &mut io::sink()) {
309+
// cast as u64 must remain, because value of IgnInit data type could be changed.
310+
if let Err(e) = io::copy(&mut reader.by_ref().take(*skip), &mut io::sink()) {
313311
return Err(format_failure_to_read_input_file(
314312
&params.executable,
315313
path,
@@ -331,7 +329,7 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
331329
let mut from = prepare_reader(&params.from, &params.skip_a, params)?;
332330
let mut to = prepare_reader(&params.to, &params.skip_b, params)?;
333331

334-
let mut offset_width = params.max_bytes.unwrap_or(usize::MAX);
332+
let mut offset_width = params.max_bytes.unwrap_or(BytesLimitU64::MAX);
335333

336334
if let (Ok(a_meta), Ok(b_meta)) = (fs::metadata(&params.from), fs::metadata(&params.to)) {
337335
#[cfg(not(target_os = "windows"))]
@@ -346,7 +344,7 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
346344
return Ok(Cmp::Different);
347345
}
348346

349-
let smaller = cmp::min(a_size, b_size) as usize;
347+
let smaller = cmp::min(a_size, b_size) as BytesLimitU64;
350348
offset_width = cmp::min(smaller, offset_width);
351349
}
352350

@@ -355,8 +353,8 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
355353
// Capacity calc: at_byte width + 2 x 3-byte octal numbers + 2 x 4-byte value + 4 spaces
356354
let mut output = Vec::<u8>::with_capacity(offset_width + 3 * 2 + 4 * 2 + 4);
357355

358-
let mut at_byte = 1;
359-
let mut at_line = 1;
356+
let mut at_byte: BytesLimitU64 = 1;
357+
let mut at_line: u64 = 1;
360358
let mut start_of_line = true;
361359
let mut stdout = BufWriter::new(io::stdout().lock());
362360
let mut compare = Cmp::Equal;
@@ -406,8 +404,8 @@ pub fn cmp(params: &Params) -> Result<Cmp, String> {
406404
if from_buf[..consumed] == to_buf[..consumed] {
407405
let last = from_buf[..consumed].last().unwrap();
408406

409-
at_byte += consumed;
410-
at_line += from_buf[..consumed].iter().filter(|&c| *c == b'\n').count();
407+
at_byte += consumed as BytesLimitU64;
408+
at_line += (from_buf[..consumed].iter().filter(|&c| *c == b'\n').count()) as u64;
411409

412410
start_of_line = *last == b'\n';
413411

@@ -595,7 +593,7 @@ fn format_visible_byte(byte: u8) -> String {
595593
fn format_verbose_difference(
596594
from_byte: u8,
597595
to_byte: u8,
598-
at_byte: usize,
596+
at_byte: BytesLimitU64,
599597
offset_width: usize,
600598
output: &mut Vec<u8>,
601599
params: &Params,
@@ -660,7 +658,13 @@ fn format_verbose_difference(
660658
}
661659

662660
#[inline]
663-
fn report_eof(at_byte: usize, at_line: usize, start_of_line: bool, eof_on: &str, params: &Params) {
661+
fn report_eof(
662+
at_byte: BytesLimitU64,
663+
at_line: u64,
664+
start_of_line: bool,
665+
eof_on: &str,
666+
params: &Params,
667+
) {
664668
if params.quiet {
665669
return;
666670
}
@@ -712,7 +716,13 @@ fn is_posix_locale() -> bool {
712716
}
713717

714718
#[inline]
715-
fn report_difference(from_byte: u8, to_byte: u8, at_byte: usize, at_line: usize, params: &Params) {
719+
fn report_difference(
720+
from_byte: u8,
721+
to_byte: u8,
722+
at_byte: BytesLimitU64,
723+
at_line: u64,
724+
params: &Params,
725+
) {
716726
if params.quiet {
717727
return;
718728
}
@@ -809,7 +819,7 @@ mod tests {
809819
from: os("foo"),
810820
to: os("bar"),
811821
skip_a: Some(1),
812-
skip_b: Some(usize::MAX),
822+
skip_b: Some(SkipU64::MAX),
813823
..Default::default()
814824
}),
815825
parse_params(
@@ -987,7 +997,7 @@ mod tests {
987997
executable: os("cmp"),
988998
from: os("foo"),
989999
to: os("bar"),
990-
max_bytes: Some(usize::MAX),
1000+
max_bytes: Some(BytesLimitU64::MAX),
9911001
..Default::default()
9921002
}),
9931003
parse_params(
@@ -1004,6 +1014,7 @@ mod tests {
10041014
);
10051015

10061016
// Failure case
1017+
// TODO This is actually fine in GNU cmp. --bytes does not have a unit parser yet.
10071018
assert_eq!(
10081019
Err("cmp: invalid --bytes value '1K'".to_string()),
10091020
parse_params(
@@ -1049,8 +1060,8 @@ mod tests {
10491060
executable: os("cmp"),
10501061
from: os("foo"),
10511062
to: os("bar"),
1052-
skip_a: Some(usize::MAX),
1053-
skip_b: Some(usize::MAX),
1063+
skip_a: Some(SkipU64::MAX),
1064+
skip_b: Some(SkipU64::MAX),
10541065
..Default::default()
10551066
}),
10561067
parse_params(
@@ -1124,8 +1135,12 @@ mod tests {
11241135
.enumerate()
11251136
{
11261137
let values = [
1127-
1_000usize.checked_pow((i + 1) as u32).unwrap_or(usize::MAX),
1128-
1024usize.checked_pow((i + 1) as u32).unwrap_or(usize::MAX),
1138+
(1_000 as SkipU64)
1139+
.checked_pow((i + 1) as u32)
1140+
.unwrap_or(SkipU64::MAX),
1141+
(1024 as SkipU64)
1142+
.checked_pow((i + 1) as u32)
1143+
.unwrap_or(SkipU64::MAX),
11291144
];
11301145
for (j, v) in values.iter().enumerate() {
11311146
assert_eq!(

0 commit comments

Comments
 (0)