Skip to content

Commit 4e388da

Browse files
ezekielnewren authored and gitster committed
xdiff: implement a white space iterator in Rust
Xdiff has traditionally implemented the logic for iterating over whitespace in every location that needed to do so. Create a consolidated iterator in Rust that we can call from each location. Write Rust unit tests to ensure the correctness of the Rust whitespace iterator and the chunked_iter_equal() function.

Signed-off-by: Ezekiel Newren <ezekielnewren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
1 parent 992477f commit 4e388da

File tree

2 files changed

+302
-0
lines changed

2 files changed

+302
-0
lines changed

rust/xdiff/src/lib.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
pub mod xutils;
2+
3+
/// Skip every run of whitespace; the whitespace iterator in `xutils` yields
/// only the non-whitespace runs when this bit is set.
pub const XDF_IGNORE_WHITESPACE: u64 = 1 << 1;

/// Treat any run of whitespace inside a line as a single space and drop a
/// run that reaches the end of the line.
pub const XDF_IGNORE_WHITESPACE_CHANGE: u64 = 1 << 2;

/// Stop iterating when a run of whitespace reaches the end of the line.
pub const XDF_IGNORE_WHITESPACE_AT_EOL: u64 = 1 << 3;

/// Drop a carriage return that immediately precedes the terminating newline.
pub const XDF_IGNORE_CR_AT_EOL: u64 = 1 << 4;

/// Union of all whitespace-related flag bits defined above.
pub const XDF_WHITESPACE_FLAGS: u64 = XDF_IGNORE_WHITESPACE
    | XDF_IGNORE_WHITESPACE_CHANGE
    | XDF_IGNORE_WHITESPACE_AT_EOL
    | XDF_IGNORE_CR_AT_EOL;
111

212

313
#[no_mangle]

rust/xdiff/src/xutils.rs

Lines changed: 292 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,292 @@
1+
use crate::*;
2+
3+
/// Reports whether `v` is one of the four whitespace bytes recognised here:
/// horizontal tab, line feed, carriage return, or space.
pub(crate) fn xdl_isspace(v: u8) -> bool {
    matches!(v, b'\t' | b'\n' | b'\r' | b' ')
}
9+
10+
pub struct WhitespaceIter<'a> {
11+
line: &'a [u8],
12+
index: usize,
13+
flags: u64,
14+
}
15+
16+
17+
impl<'a> WhitespaceIter<'a> {
18+
pub fn new(line: &'a [u8], flags: u64) -> Self {
19+
Self {
20+
line,
21+
index: 0,
22+
flags,
23+
}
24+
}
25+
}
26+
27+
impl<'a> Iterator for WhitespaceIter<'a> {
28+
type Item = &'a [u8];
29+
30+
fn next(&mut self) -> Option<Self::Item> {
31+
if self.index >= self.line.len() {
32+
return None;
33+
}
34+
35+
loop {
36+
let start = self.index;
37+
if self.index == self.line.len() {
38+
return None;
39+
}
40+
41+
/* return contiguous run of not space bytes */
42+
while self.index < self.line.len() {
43+
if xdl_isspace(self.line[self.index]) {
44+
break;
45+
}
46+
self.index += 1;
47+
}
48+
if self.index > start {
49+
return Some(&self.line[start..self.index]);
50+
}
51+
/* the current byte had better be a space */
52+
if !xdl_isspace(self.line[self.index]) {
53+
panic!("xdl_line_iter_next xdl_isspace() is false")
54+
}
55+
56+
while self.index < self.line.len() && xdl_isspace(self.line[self.index]) {
57+
self.index += 1;
58+
}
59+
60+
61+
if self.index <= start {
62+
panic!("xdl_isspace() cannot simultaneously be true and false");
63+
}
64+
65+
if (self.flags & XDF_IGNORE_WHITESPACE_AT_EOL) != 0
66+
&& self.index == self.line.len()
67+
{
68+
return None;
69+
}
70+
if (self.flags & XDF_IGNORE_WHITESPACE) != 0 {
71+
continue;
72+
}
73+
if (self.flags & XDF_IGNORE_WHITESPACE_CHANGE) != 0 {
74+
if self.index == self.line.len() {
75+
continue;
76+
}
77+
return Some(" ".as_bytes());
78+
}
79+
if (self.flags & XDF_IGNORE_CR_AT_EOL) != 0 {
80+
if start < self.line.len() && self.index == self.line.len() {
81+
let mut end = self.line.len();
82+
if end > 0 && self.line[end - 1] == b'\n' {
83+
if end - start == 1 {
84+
return Some(&self.line[start..end]);
85+
} else {
86+
end -= 1;
87+
}
88+
if end > 0 && self.line[end - 1] == b'\r' {
89+
self.index = end;
90+
end -= 1;
91+
if end - start == 0 {
92+
continue;
93+
}
94+
return Some(&self.line[start..end]);
95+
}
96+
}
97+
}
98+
}
99+
return Some(&self.line[start..self.index]);
100+
}
101+
}
102+
}
103+
104+
/// Reports whether two iterators of slices yield the same overall sequence of
/// elements, regardless of how each side is divided into chunks.
///
/// Empty chunks on either side are ignored.
pub fn chunked_iter_equal<'a, T, IT0, IT1>(mut it0: IT0, mut it1: IT1) -> bool
where
    T: Eq + 'a,
    IT0: Iterator<Item = &'a [T]>,
    IT1: Iterator<Item = &'a [T]>,
{
    /* the not-yet-compared tail of the current chunk on each side */
    let mut pending0: Option<&'a [T]> = it0.next();
    let mut pending1: Option<&'a [T]> = it1.next();

    while let (Some(left), Some(right)) = (pending0, pending1) {
        let shared = left.len().min(right.len());
        if left[..shared] != right[..shared] {
            return false;
        }

        /* whichever side was fully consumed moves on to its next chunk */
        pending0 = if shared == left.len() {
            it0.next()
        } else {
            Some(&left[shared..])
        };
        pending1 = if shared == right.len() {
            it1.next()
        } else {
            Some(&right[shared..])
        };
    }

    /* trailing empty chunks do not count as a difference */
    while matches!(pending0, Some(chunk) if chunk.is_empty()) {
        pending0 = it0.next();
    }
    while matches!(pending1, Some(chunk) if chunk.is_empty()) {
        pending1 = it1.next();
    }

    pending0.is_none() && pending1.is_none()
}
153+
154+
#[cfg(test)]
mod tests {
    use crate::*;
    use crate::xutils::{chunked_iter_equal, WhitespaceIter};

    /// Runs `WhitespaceIter` over `line` with `flags`, concatenates every run
    /// into `buffer`, and returns the result as a string slice.
    ///
    /// `buffer` is cleared first so one allocation can be reused across cases.
    fn extract_string<'a>(line: &[u8], flags: u64, buffer: &'a mut Vec<u8>) -> &'a str {
        buffer.clear();
        for run in WhitespaceIter::new(line, flags) {
            buffer.extend_from_slice(run);
        }
        /* a checked conversion keeps the helper free of `unsafe` */
        std::str::from_utf8(buffer.as_slice())
            .expect("test inputs are ASCII, so the collected runs must be valid UTF-8")
    }

    /// Adapts a slice of string chunks into an iterator of byte-slice chunks.
    fn get_str_it<'a>(slice: &'a [&'a str]) -> impl Iterator<Item = &'a [u8]> + 'a {
        slice.iter().map(|v| (*v).as_bytes())
    }

    #[test]
    fn test_ignore_space() {
        /* (expected output, input line, flags) */
        let tv_individual = vec![
            ("ab\r", "ab\r", XDF_IGNORE_CR_AT_EOL),
            ("ab \r", "ab \r", XDF_IGNORE_CR_AT_EOL),
            ("\r \t a \r", "\r \t a \r", XDF_IGNORE_CR_AT_EOL),
            ("\r a \r", "\r a \r", XDF_IGNORE_CR_AT_EOL),
            ("\r", "\r", XDF_IGNORE_CR_AT_EOL),
            ("", "", XDF_IGNORE_CR_AT_EOL),
            ("\r a \r", "\r a \r", XDF_IGNORE_CR_AT_EOL),

            ("\r \t a \n", "\r \t a \r\n", XDF_IGNORE_CR_AT_EOL),
            ("\r a \n", "\r a \r\n", XDF_IGNORE_CR_AT_EOL),
            ("\n", "\r\n", XDF_IGNORE_CR_AT_EOL),
            ("\n", "\n", XDF_IGNORE_CR_AT_EOL),
            ("\r a \n", "\r a \n", XDF_IGNORE_CR_AT_EOL),

            ("1\n", "1\r\n", XDF_IGNORE_CR_AT_EOL),
            ("1", "1\r\n", XDF_IGNORE_WHITESPACE_CHANGE),

            ("\r \t a \r\n", "\r \t a \r\n", 0),
            ("\r a \r\n", "\r a \r\n", 0),
            ("\r\n", "\r\n", 0),
            ("\n", "\n", 0),
            ("\r a \n", "\r a \n", 0),
            (" \n", " \n", 0),
            ("a \n", "a \n", 0),
            (" a \t asdf \t \r\n", " a \t asdf \t \r\n", 0),
            ("\t a b \t \n", "\t a b \t \n", 0),
            (" a b \t \r\n", " a b \t \r\n", 0),
            ("\t a \n", "\t a \n", 0),
            ("\t\t\ta\t\n", "\t\t\ta\t\n", 0),
            ("a\n", "a\n", 0),
            ("\ta\n", "\ta\n", 0),

            ("a", "\r \t a \r\n", XDF_IGNORE_WHITESPACE),
            ("a", "\r a \r\n", XDF_IGNORE_WHITESPACE),
            ("", "\r\n", XDF_IGNORE_WHITESPACE),
            ("", "\n", XDF_IGNORE_WHITESPACE),
            ("a", "\r a \n", XDF_IGNORE_WHITESPACE),
            ("", " \n", XDF_IGNORE_WHITESPACE),
            ("a", "a \n", XDF_IGNORE_WHITESPACE),
            ("aasdf", " a \t asdf \t \r\n", XDF_IGNORE_WHITESPACE),
            ("ab", "\t a b \t \n", XDF_IGNORE_WHITESPACE),
            ("ab", " a b \t \r\n", XDF_IGNORE_WHITESPACE),
            ("a", "\t a \n", XDF_IGNORE_WHITESPACE),
            ("a", "\t\t\ta\t\n", XDF_IGNORE_WHITESPACE),
            ("a", "a\n", XDF_IGNORE_WHITESPACE),
            ("a", "\ta\n", XDF_IGNORE_WHITESPACE),

            ("", " \n", XDF_IGNORE_WHITESPACE_AT_EOL),
            ("a", "a \n", XDF_IGNORE_WHITESPACE_AT_EOL),
            (" a \t asdf", " a \t asdf \t \r\n", XDF_IGNORE_WHITESPACE_AT_EOL),
            ("\t a b", "\t a b \t \n", XDF_IGNORE_WHITESPACE_AT_EOL),

            (" a b", " a b \t \r\n", XDF_IGNORE_WHITESPACE_CHANGE),
            (" a", "\t a \n", XDF_IGNORE_WHITESPACE_CHANGE),
            (" a", "\t\t\ta\t\n", XDF_IGNORE_WHITESPACE_CHANGE),
            ("a", "a\n", XDF_IGNORE_WHITESPACE_CHANGE),
            (" a", "\ta\n", XDF_IGNORE_WHITESPACE_CHANGE),

            ("ab", " a b \t \r\n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
            ("a", "\t a \n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
            ("a", "\t\t\ta\t\n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
            ("a", "a\n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
            ("a", "\ta\n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
        ];

        let mut buffer = Vec::<u8>::new();
        for (expected, input, flags) in tv_individual {
            let actual = extract_string(input.as_bytes(), flags, &mut buffer);
            assert_eq!(expected, actual, "input: {:?} flags: 0x{:x}", input, flags);
        }
    }

    #[test]
    fn test_chunked_iter_equal() {
        /*
         * The expected result for every pair is computed below by comparing
         * the flattened byte sequences; the grouping is for the reader only.
         */
        let tv_str: Vec<(Vec<&str>, Vec<&str>)> = vec![
            /* equal cases */
            (vec!["", "", "abc"], vec!["", "abc"]),
            (vec!["c", "", "a"], vec!["c", "a"]),
            (vec!["a", "", "b", "", "c"], vec!["a", "b", "c"]),
            (vec!["", "", "a"], vec!["a"]),
            (vec!["", "a"], vec!["a"]),
            (vec![""], vec![]),
            (vec!["", ""], vec![""]),
            (vec!["a"], vec!["", "", "a"]),
            (vec!["a"], vec!["", "a"]),
            (vec![], vec![""]),
            (vec![""], vec!["", ""]),
            (vec!["hello ", "world"], vec!["hel", "lo wo", "rld"]),
            (vec!["hel", "lo wo", "rld"], vec!["hello ", "world"]),
            (vec!["hello world"], vec!["hello world"]),
            (vec![], vec![]),

            /* different cases */
            (vec!["abc", "def"], vec!["def", "abc"]),
            (vec!["abc"], vec![]),
            (vec!["", "", ""], vec!["", "a"]),
            (vec!["", "a"], vec!["b", ""]),
            (vec!["abc"], vec!["abc", "de"]),
            (vec!["abc", "de"], vec!["abc"]),
            (vec![], vec!["a"]),
            (vec!["a"], vec![]),
            (vec!["abc", "kj"], vec!["abc", "de"]),
        ];

        for (lhs, rhs) in tv_str.iter() {
            let a: Vec<u8> = get_str_it(lhs).flatten().copied().collect();
            let b: Vec<u8> = get_str_it(rhs).flatten().copied().collect();
            let expected = a.as_slice() == b.as_slice();

            let it0 = get_str_it(lhs);
            let it1 = get_str_it(rhs);
            let actual = chunked_iter_equal(it0, it1);
            assert_eq!(expected, actual, "lhs: {:?} rhs: {:?}", lhs, rhs);
        }
    }
}

0 commit comments

Comments
 (0)