xdiff: implement a white space iterator in Rust

ezekielnewren · gitster · commit 4e388da5fa09 · 2025-08-17T17:04:10.000-07:00
Xdiff has traditionally implemented the logic for iterating over
whitespace in every location that needed to do so. Create a consolidated
iterator in Rust that we can call from each location. Write Rust unit
tests to ensure the correctness of the Rust whitespace iterator and the
chunked_iter_equal() function.

Signed-off-by: Ezekiel Newren &lt;ezekielnewren@gmail.com&gt;
Signed-off-by: Junio C Hamano &lt;gitster@pobox.com&gt;
diff --git a/rust/xdiff/src/lib.rs b/rust/xdiff/src/lib.rs
@@ -1,3 +1,13 @@
+pub mod xutils;
+
+pub const XDF_IGNORE_WHITESPACE: u64 = 1 << 1;
+pub const XDF_IGNORE_WHITESPACE_CHANGE: u64 = 1 << 2;
+pub const XDF_IGNORE_WHITESPACE_AT_EOL: u64 = 1 << 3;
+pub const XDF_IGNORE_CR_AT_EOL: u64 = 1 << 4;
+pub const XDF_WHITESPACE_FLAGS: u64 = XDF_IGNORE_WHITESPACE |
+    XDF_IGNORE_WHITESPACE_CHANGE |
+    XDF_IGNORE_WHITESPACE_AT_EOL |
+    XDF_IGNORE_CR_AT_EOL;
 
 
 #[no_mangle]
diff --git a/rust/xdiff/src/xutils.rs b/rust/xdiff/src/xutils.rs
@@ -0,0 +1,292 @@
+use crate::*;
+
+pub(crate) fn xdl_isspace(v: u8) -> bool {
+    match v {
+        b'\t' | b'\n' | b'\r' | b' ' => true,
+        _ => false,
+    }
+}
+
+pub struct WhitespaceIter<'a> {
+    line: &'a [u8],
+    index: usize,
+    flags: u64,
+}
+
+
+impl<'a> WhitespaceIter<'a> {
+    pub fn new(line: &'a [u8], flags: u64) -> Self {
+        Self {
+            line,
+            index: 0,
+            flags,
+        }
+    }
+}
+
+impl<'a> Iterator for WhitespaceIter<'a> {
+    type Item = &'a [u8];
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.index >= self.line.len() {
+            return None;
+        }
+
+        loop {
+            let start = self.index;
+            if self.index == self.line.len() {
+                return None;
+            }
+
+            /* return contiguous run of not space bytes */
+            while self.index < self.line.len() {
+                if xdl_isspace(self.line[self.index]) {
+                    break;
+                }
+                self.index += 1;
+            }
+            if self.index > start {
+                return Some(&self.line[start..self.index]);
+            }
+            /* the current byte had better be a space */
+            if !xdl_isspace(self.line[self.index]) {
+                panic!("xdl_line_iter_next xdl_isspace() is false")
+            }
+
+            while self.index < self.line.len() && xdl_isspace(self.line[self.index]) {
+                self.index += 1;
+            }
+
+
+            if self.index <= start {
+                panic!("xdl_isspace() cannot simultaneously be true and false");
+            }
+
+            if (self.flags & XDF_IGNORE_WHITESPACE_AT_EOL) != 0
+                && self.index == self.line.len()
+            {
+                return None;
+            }
+            if (self.flags & XDF_IGNORE_WHITESPACE) != 0 {
+                continue;
+            }
+            if (self.flags & XDF_IGNORE_WHITESPACE_CHANGE) != 0 {
+                if self.index == self.line.len() {
+                    continue;
+                }
+                return Some(" ".as_bytes());
+            }
+            if (self.flags & XDF_IGNORE_CR_AT_EOL) != 0 {
+                if start < self.line.len() && self.index == self.line.len() {
+                    let mut end = self.line.len();
+                    if end > 0 && self.line[end - 1] == b'\n' {
+                        if end - start == 1 {
+                            return Some(&self.line[start..end]);
+                        } else {
+                            end -= 1;
+                        }
+                        if end > 0 && self.line[end - 1] == b'\r' {
+                            self.index = end;
+                            end -= 1;
+                            if end - start == 0 {
+                                continue;
+                            }
+                            return Some(&self.line[start..end]);
+                        }
+                    }
+                }
+            }
+            return Some(&self.line[start..self.index]);
+        }
+    }
+}
+
+pub fn chunked_iter_equal<'a, T, IT0, IT1>(mut it0: IT0, mut it1: IT1) -> bool
+where
+    T: Eq + 'a,
+    IT0: Iterator<Item = &'a [T]>,
+    IT1: Iterator<Item = &'a [T]>,
+{
+    let mut run_option0: Option<&[T]> = it0.next();
+    let mut run_option1: Option<&[T]> = it1.next();
+    let mut i0 = 0;
+    let mut i1 = 0;
+
+    while let (Some(run0), Some(run1)) = (run_option0, run_option1) {
+        while i0 < run0.len() && i1 < run1.len() {
+            if run0[i0] != run1[i1] {
+                return false;
+            }
+
+            i0 += 1;
+            i1 += 1;
+        }
+
+        if i0 == run0.len() {
+            i0 = 0;
+            run_option0 = it0.next();
+        }
+        if i1 == run1.len() {
+            i1 = 0;
+            run_option1 = it1.next();
+        }
+    }
+
+    while let Some(run0) = run_option0 {
+        if run0.len() == 0 {
+            run_option0 = it0.next();
+        } else {
+            break;
+        }
+    }
+
+    while let Some(run1) = run_option1 {
+        if run1.len() == 0 {
+            run_option1 = it1.next();
+        } else {
+            break;
+        }
+    }
+
+    run_option0.is_none() && run_option1.is_none()
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::*;
+    use crate::xutils::{chunked_iter_equal, WhitespaceIter};
+
+    fn extract_string<'a>(line: &[u8], flags: u64, buffer: &'a mut Vec<u8>) -> &'a str {
+        let it = WhitespaceIter::new(line, flags);
+        buffer.clear();
+        for run in it {
+            #[cfg(test)]
+            let _view = unsafe { std::str::from_utf8_unchecked(run) };
+            buffer.extend_from_slice(run);
+        }
+        unsafe { std::str::from_utf8_unchecked(buffer.as_slice()) }
+    }
+
+    fn get_str_it<'a>(slice: &'a [&'a str]) -> impl Iterator<Item = &'a [u8]> + 'a {
+        slice.iter().map(|v| (*v).as_bytes())
+    }
+
+    #[test]
+    fn test_ignore_space() {
+        let tv_individual = vec![
+            ("ab\r", "ab\r", XDF_IGNORE_CR_AT_EOL),
+            ("ab \r", "ab \r", XDF_IGNORE_CR_AT_EOL),
+            ("\r \t a \r", "\r \t a \r", XDF_IGNORE_CR_AT_EOL),
+            ("\r a \r", "\r a \r", XDF_IGNORE_CR_AT_EOL),
+            ("\r", "\r", XDF_IGNORE_CR_AT_EOL),
+            ("", "", XDF_IGNORE_CR_AT_EOL),
+            ("\r a \r", "\r a \r", XDF_IGNORE_CR_AT_EOL),
+
+            ("\r \t a \n", "\r \t a \r\n", XDF_IGNORE_CR_AT_EOL),
+            ("\r a \n", "\r a \r\n", XDF_IGNORE_CR_AT_EOL),
+            ("\n", "\r\n", XDF_IGNORE_CR_AT_EOL),
+            ("\n", "\n", XDF_IGNORE_CR_AT_EOL),
+            ("\r a \n", "\r a \n", XDF_IGNORE_CR_AT_EOL),
+
+            ("1\n", "1\r\n", XDF_IGNORE_CR_AT_EOL),
+            ("1", "1\r\n", XDF_IGNORE_WHITESPACE_CHANGE),
+
+            ("\r \t a \r\n", "\r \t a \r\n", 0),
+            ("\r a \r\n", "\r a \r\n", 0),
+            ("\r\n", "\r\n", 0),
+            ("\n", "\n", 0),
+            ("\r a \n", "\r a \n", 0),
+            ("     \n", "     \n", 0),
+            ("a     \n", "a     \n", 0),
+            ("  a  \t  asdf  \t \r\n", "  a  \t  asdf  \t \r\n", 0),
+            ("\t a  b  \t \n", "\t a  b  \t \n", 0),
+            ("  a b \t \r\n", "  a b \t \r\n", 0),
+            ("\t  a \n", "\t  a \n", 0),
+            ("\t\t\ta\t\n", "\t\t\ta\t\n", 0),
+            ("a\n", "a\n", 0),
+            ("\ta\n", "\ta\n", 0),
+
+            ("a", "\r \t a \r\n", XDF_IGNORE_WHITESPACE),
+            ("a", "\r a \r\n", XDF_IGNORE_WHITESPACE),
+            ("", "\r\n", XDF_IGNORE_WHITESPACE),
+            ("", "\n", XDF_IGNORE_WHITESPACE),
+            ("a", "\r a \n", XDF_IGNORE_WHITESPACE),
+            ("", "     \n", XDF_IGNORE_WHITESPACE),
+            ("a", "a     \n", XDF_IGNORE_WHITESPACE),
+            ("aasdf", "  a  \t  asdf  \t \r\n", XDF_IGNORE_WHITESPACE),
+            ("ab", "\t a  b  \t \n", XDF_IGNORE_WHITESPACE),
+            ("ab", "  a b \t \r\n", XDF_IGNORE_WHITESPACE),
+            ("a", "\t  a \n", XDF_IGNORE_WHITESPACE),
+            ("a", "\t\t\ta\t\n", XDF_IGNORE_WHITESPACE),
+            ("a", "a\n", XDF_IGNORE_WHITESPACE),
+            ("a", "\ta\n", XDF_IGNORE_WHITESPACE),
+
+            ("", "     \n", XDF_IGNORE_WHITESPACE_AT_EOL),
+            ("a", "a     \n", XDF_IGNORE_WHITESPACE_AT_EOL),
+            ("  a  \t  asdf", "  a  \t  asdf  \t \r\n", XDF_IGNORE_WHITESPACE_AT_EOL),
+            ("\t a  b", "\t a  b  \t \n", XDF_IGNORE_WHITESPACE_AT_EOL),
+
+            (" a b", "  a b \t \r\n", XDF_IGNORE_WHITESPACE_CHANGE),
+            (" a", "\t  a \n", XDF_IGNORE_WHITESPACE_CHANGE),
+            (" a", "\t\t\ta\t\n", XDF_IGNORE_WHITESPACE_CHANGE),
+            ("a", "a\n", XDF_IGNORE_WHITESPACE_CHANGE),
+            (" a", "\ta\n", XDF_IGNORE_WHITESPACE_CHANGE),
+
+            ("ab", "  a b \t \r\n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
+            ("a", "\t  a \n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
+            ("a", "\t\t\ta\t\n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
+            ("a", "a\n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
+            ("a", "\ta\n", XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE),
+        ];
+
+        let mut buffer = Vec::<u8>::new();
+        for (expected, input, flags) in tv_individual {
+            let actual = extract_string(input.as_bytes(), flags, &mut buffer);
+            assert_eq!(expected, actual, "input: {:?} flags: 0x{:x}", input, flags);
+        }
+    }
+
+    #[test]
+    fn test_chunked_iter_equal() {
+        let tv_str: Vec<(Vec<&str>, Vec<&str>)> = vec![
+            /* equal cases */
+            (vec!["", "", "abc"],         vec!["", "abc"]),
+            (vec!["c", "", "a"],          vec!["c", "a"]),
+            (vec!["a", "", "b", "", "c"], vec!["a", "b", "c"]),
+            (vec!["", "", "a"],           vec!["a"]),
+            (vec!["", "a"],               vec!["a"]),
+            (vec![""],                    vec![]),
+            (vec!["", ""],                vec![""]),
+            (vec!["a"],                   vec!["", "", "a"]),
+            (vec!["a"],                   vec!["", "a"]),
+            (vec![],                      vec![""]),
+            (vec![""],                    vec!["", ""]),
+            (vec!["hello ", "world"],     vec!["hel", "lo wo", "rld"]),
+            (vec!["hel", "lo wo", "rld"], vec!["hello ", "world"]),
+            (vec!["hello world"],         vec!["hello world"]),
+            (vec!["abc", "def"],          vec!["def", "abc"]),
+            (vec![],                      vec![]),
+
+            /* different cases */
+            (vec!["abc"],       vec![]),
+            (vec!["", "", ""],  vec!["", "a"]),
+            (vec!["", "a"],     vec!["b", ""]),
+            (vec!["abc"],       vec!["abc", "de"]),
+            (vec!["abc", "de"], vec!["abc"]),
+            (vec![],            vec!["a"]),
+            (vec!["a"],         vec![]),
+            (vec!["abc", "kj"], vec!["abc", "de"]),
+        ];
+
+        for (lhs, rhs) in tv_str.iter() {
+            let a: Vec<u8> = get_str_it(lhs).flatten().copied().collect();
+            let b: Vec<u8> = get_str_it(rhs).flatten().copied().collect();
+            let expected = a.as_slice() == b.as_slice();
+
+            let it0 = get_str_it(lhs);
+            let it1 = get_str_it(rhs);
+            let actual = chunked_iter_equal(it0, it1);
+            assert_eq!(expected, actual);
+        }
+    }
+}