Update Replacer trait for Unicode regexes.

This uses the new Replacer trait essentially as defined in the `bytes` sub-module and described in #151. Fixes #151.
rust-lang · BurntSushi · May 2, 2016 · May 2, 2016 · May 7, 2016 · May 7, 2016
commit 8b18b29eb2105b65663ba6973f4630cd3119bb62
diff --git a/src/expand.rs b/src/expand.rs
@@ -2,9 +2,50 @@ use std::str;
 
 use memchr::memchr;
 
-use bytes::Captures;
+use re_bytes;
+use re_unicode;
 
-pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
+pub fn expand_str(
+    caps: &re_unicode::Captures,
+    mut replacement: &str,
+    dst: &mut String,
+) {
+    while !replacement.is_empty() {
+        match memchr(b'$', replacement.as_bytes()) {
+            None => break,
+            Some(i) => {
+                dst.push_str(&replacement[..i]);
+                replacement = &replacement[i..];
+            }
+        }
+        if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
+            dst.push_str("$");
+            replacement = &replacement[2..];
+            continue;
+        }
+        debug_assert!(!replacement.is_empty());
+        let cap_ref = match find_cap_ref(replacement) {
+            Some(cap_ref) => cap_ref,
+            None => {
+                dst.push_str("$");
+                replacement = &replacement[1..];
+                continue;
+            }
+        };
+        replacement = &replacement[cap_ref.end..];
+        match cap_ref.cap {
+            Ref::Number(i) => dst.push_str(caps.at(i).unwrap_or("")),
+            Ref::Named(name) => dst.push_str(caps.name(name).unwrap_or("")),
+        }
+    }
+    dst.push_str(replacement);
+}
+
+pub fn expand_bytes(
+    caps: &re_bytes::Captures,
+    mut replacement: &[u8],
+    dst: &mut Vec<u8>,
+) {
     while !replacement.is_empty() {
         match memchr(b'$', replacement) {
             None => break,
@@ -27,7 +68,7 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
                 continue;
             }
         };
-        replacement = cap_ref.rest;
+        replacement = &replacement[cap_ref.end..];
         match cap_ref.cap {
             Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")),
             Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")),
@@ -36,56 +77,127 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
     dst.extend(replacement);
 }
 
+/// CaptureRef represents a reference to a capture group inside some text. The
+/// reference is either a capture group name or a number.
+///
+/// It is also tagged with the position in the text immediately following the
+/// capture reference.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
 struct CaptureRef<'a> {
-    rest: &'a [u8],
     cap: Ref<'a>,
+    end: usize,
 }
 
+/// A reference to a capture group in some text.
+///
+/// e.g., `$2`, `$foo`, `${foo}`.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
 enum Ref<'a> {
     Named(&'a str),
     Number(usize),
 }
 
-fn find_cap_ref(mut replacement: &[u8]) -> Option<CaptureRef> {
-    if replacement.len() <= 1 || replacement[0] != b'$' {
+impl<'a> From<&'a str> for Ref<'a> {
+    fn from(x: &'a str) -> Ref<'a> {
+        Ref::Named(x)
+    }
+}
+
+impl From<usize> for Ref<'static> {
+    fn from(x: usize) -> Ref<'static> {
+        Ref::Number(x)
+    }
+}
+
+/// Parses a possible reference to a capture group name in the given text,
+/// starting at the beginning of `replacement`.
+///
+/// If no such valid reference could be found, None is returned.
+fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
+    replacement: &T,
+) -> Option<CaptureRef> {
+    let mut i = 0;
+    let rep: &[u8] = replacement.as_ref();
+    if rep.len() <= 1 || rep[0] != b'$' {
         return None;
     }
     let mut brace = false;
-    replacement = &replacement[1..];
-    if replacement[0] == b'{' {
+    i += 1;
+    if rep[i] == b'{' {
         brace = true;
-        replacement = &replacement[1..];
+        i += 1;
     }
-    let mut cap_end = 0;
-    while replacement.get(cap_end).map_or(false, is_valid_cap_letter) {
+    let mut cap_end = i;
+    while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
         cap_end += 1;
     }
-    if cap_end == 0 {
+    if cap_end == i {
         return None;
     }
     // We just verified that the range 0..cap_end is valid ASCII, so it must
     // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
     // check with either unsafe or by parsing the number straight from &[u8].
-    let cap = str::from_utf8(&replacement[..cap_end])
+    let cap = str::from_utf8(&rep[i..cap_end])
                   .ok().expect("valid UTF-8 capture name");
     if brace {
-        if !replacement.get(cap_end).map_or(false, |&b| b == b'}') {
+        if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
             return None;
         }
         cap_end += 1;
     }
     Some(CaptureRef {
-        rest: &replacement[cap_end..],
         cap: match cap.parse::<u32>() {
             Ok(i) => Ref::Number(i as usize),
             Err(_) => Ref::Named(cap),
         },
+        end: cap_end,
     })
 }
 
+/// Returns true if and only if the given byte is allowed in a capture name.
 fn is_valid_cap_letter(b: &u8) -> bool {
     match *b {
         b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true,
         _ => false,
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::{CaptureRef, find_cap_ref};
+
+    macro_rules! find {
+        ($name:ident, $text:expr) => {
+            #[test]
+            fn $name() {
+                assert_eq!(None, find_cap_ref($text));
+            }
+        };
+        ($name:ident, $text:expr, $capref:expr) => {
+            #[test]
+            fn $name() {
+                assert_eq!(Some($capref), find_cap_ref($text));
+            }
+        };
+    }
+
+    macro_rules! c {
+        ($name_or_number:expr, $pos:expr) => {
+            CaptureRef { cap: $name_or_number.into(), end: $pos }
+        };
+    }
+
+    find!(find_cap_ref1, "$foo", c!("foo", 4));
+    find!(find_cap_ref2, "${foo}", c!("foo", 6));
+    find!(find_cap_ref3, "$0", c!(0, 2));
+    find!(find_cap_ref4, "$5", c!(5, 2));
+    find!(find_cap_ref5, "$10", c!(10, 3));
+    find!(find_cap_ref6, "$42a", c!("42a", 4));
+    find!(find_cap_ref7, "${42}a", c!(42, 5));
+    find!(find_cap_ref8, "${42");
+    find!(find_cap_ref9, "${42 ");
+    find!(find_cap_ref10, " $0 ");
+    find!(find_cap_ref11, "$");
+    find!(find_cap_ref12, " ");
+    find!(find_cap_ref13, "");
+}
diff --git a/src/re_bytes.rs b/src/re_bytes.rs
@@ -19,7 +19,7 @@ use std::sync::Arc;
 use memchr::memchr;
 
 use exec::{Exec, ExecNoSync};
-use expand::expand;
+use expand::expand_bytes;
 use error::Error;
 use re_builder::bytes::RegexBuilder;
 use re_trait::{self, RegularExpression, Slot};
@@ -375,6 +375,25 @@ impl Regex {
     /// If no match is found, then a copy of the byte string is returned
     /// unchanged.
     ///
+    /// # Replacement string syntax
+    ///
+    /// All instances of `$name` in the replacement text are replaced with the
+    /// corresponding capture group `name`.
+    ///
+    /// `name` may be an integer corresponding to the index of the
+    /// capture group (counted by order of opening parenthesis where `0` is the
+    /// entire match) or it can be a name (consisting of letters, digits or
+    /// underscores) corresponding to a named capture group.
+    ///
+    /// If `name` isn't a valid capture group (whether the name doesn't exist
+    /// or isn't a valid index), then it is replaced with the empty string.
+    ///
+    /// The longest possible name is used. e.g., `$1a` looks up the capture
+    /// group named `1a` and not the capture group at index `1`. To exert more
+    /// precise control over the name, use braces, e.g., `${1}a`.
+    ///
+    /// To write a literal `$` use `$$`.
+    ///
     /// # Examples
     ///
     /// Note that this function is polymorphic with respect to the replacement.
@@ -768,7 +787,7 @@ impl<'t> Captures<'t> {
     ///
     /// To write a literal `$` use `$$`.
     pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
-        expand(self, replacement, dst)
+        expand_bytes(self, replacement, dst)
     }
 
     /// Returns the number of captured groups.