Skip to content

regex 1.0 #230

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Update Replacer trait for Unicode regexes.
This uses the new Replacer trait essentially as defined in the `bytes`
sub-module and described in #151.

Fixes #151
  • Loading branch information
BurntSushi committed Aug 5, 2016
commit 8b18b29eb2105b65663ba6973f4630cd3119bb62
142 changes: 127 additions & 15 deletions src/expand.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,50 @@ use std::str;

use memchr::memchr;

use bytes::Captures;
use re_bytes;
use re_unicode;

pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
/// Expands every capture group reference found in `replacement` using the
/// groups in `caps`, appending the result to `dst`.
///
/// Text outside of references is copied through unchanged. `$$` produces a
/// single literal `$`. A `$` that does not begin a valid reference is also
/// copied through literally. A reference to a group that did not participate
/// in the match (or does not exist) expands to the empty string.
pub fn expand_str(
    caps: &re_unicode::Captures,
    mut replacement: &str,
    dst: &mut String,
) {
    while !replacement.is_empty() {
        // Copy the literal text preceding the next `$`. When there is no
        // `$` left, the remainder is flushed after the loop.
        let dollar = match memchr(b'$', replacement.as_bytes()) {
            Some(pos) => pos,
            None => break,
        };
        dst.push_str(&replacement[..dollar]);
        replacement = &replacement[dollar..];

        // `$$` is the escape for a literal dollar sign.
        if replacement.as_bytes().get(1) == Some(&b'$') {
            dst.push_str("$");
            replacement = &replacement[2..];
            continue;
        }
        debug_assert!(!replacement.is_empty());

        if let Some(cap_ref) = find_cap_ref(replacement) {
            // Skip past the reference, then emit the text of the group it
            // names (or nothing if that group is absent).
            replacement = &replacement[cap_ref.end..];
            let text = match cap_ref.cap {
                Ref::Number(i) => caps.at(i).unwrap_or(""),
                Ref::Named(name) => caps.name(name).unwrap_or(""),
            };
            dst.push_str(text);
        } else {
            // Not a valid reference: keep the `$` as-is and move on.
            dst.push_str("$");
            replacement = &replacement[1..];
        }
    }
    dst.push_str(replacement);
}

pub fn expand_bytes(
caps: &re_bytes::Captures,
mut replacement: &[u8],
dst: &mut Vec<u8>,
) {
while !replacement.is_empty() {
match memchr(b'$', replacement) {
None => break,
Expand All @@ -27,7 +68,7 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
continue;
}
};
replacement = cap_ref.rest;
replacement = &replacement[cap_ref.end..];
match cap_ref.cap {
Ref::Number(i) => dst.extend(caps.at(i).unwrap_or(b"")),
Ref::Named(name) => dst.extend(caps.name(name).unwrap_or(b"")),
Expand All @@ -36,56 +77,127 @@ pub fn expand(caps: &Captures, mut replacement: &[u8], dst: &mut Vec<u8>) {
dst.extend(replacement);
}

/// CaptureRef represents a reference to a capture group inside some text. The
/// reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text immediately following the
/// capture reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
rest: &'a [u8],
cap: Ref<'a>,
end: usize,
}

/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
///
/// A reference whose text parses as an integer is represented as `Number`;
/// any other reference is represented as `Named`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
/// A reference by capture group name, borrowed from the replacement text.
Named(&'a str),
/// A reference by capture group index.
Number(usize),
}

fn find_cap_ref(mut replacement: &[u8]) -> Option<CaptureRef> {
if replacement.len() <= 1 || replacement[0] != b'$' {
/// Converts a capture group name into a `Ref::Named` reference.
impl<'a> From<&'a str> for Ref<'a> {
    fn from(name: &'a str) -> Self {
        Ref::Named(name)
    }
}

/// Converts a capture group index into a `Ref::Number` reference.
impl From<usize> for Ref<'static> {
    fn from(index: usize) -> Self {
        Ref::Number(index)
    }
}

/// Parses a possible reference to a capture group name in the given text,
/// starting at the beginning of `replacement`.
///
/// If no such valid reference could be found, None is returned.
fn find_cap_ref<T: ?Sized + AsRef<[u8]>>(
replacement: &T,
) -> Option<CaptureRef> {
let mut i = 0;
let rep: &[u8] = replacement.as_ref();
if rep.len() <= 1 || rep[0] != b'$' {
return None;
}
let mut brace = false;
replacement = &replacement[1..];
if replacement[0] == b'{' {
i += 1;
if rep[i] == b'{' {
brace = true;
replacement = &replacement[1..];
i += 1;
}
let mut cap_end = 0;
while replacement.get(cap_end).map_or(false, is_valid_cap_letter) {
let mut cap_end = i;
while rep.get(cap_end).map_or(false, is_valid_cap_letter) {
cap_end += 1;
}
if cap_end == 0 {
if cap_end == i {
return None;
}
// We just verified that the range i..cap_end is valid ASCII, so it must
// therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
// check with either unsafe or by parsing the number straight from &[u8].
let cap = str::from_utf8(&replacement[..cap_end])
let cap = str::from_utf8(&rep[i..cap_end])
.ok().expect("valid UTF-8 capture name");
if brace {
if !replacement.get(cap_end).map_or(false, |&b| b == b'}') {
if !rep.get(cap_end).map_or(false, |&b| b == b'}') {
return None;
}
cap_end += 1;
}
Some(CaptureRef {
rest: &replacement[cap_end..],
cap: match cap.parse::<u32>() {
Ok(i) => Ref::Number(i as usize),
Err(_) => Ref::Named(cap),
},
end: cap_end,
})
}

/// Returns true if and only if the given byte is allowed in a capture name.
///
/// The allowed bytes are the ASCII digits `0-9`, the ASCII letters `a-z` and
/// `A-Z`, and the underscore `_`. Every other byte (including all non-ASCII
/// bytes) is rejected.
///
/// The byte is taken by reference so that this function can be passed
/// directly to `Option<&u8>::map_or`.
fn is_valid_cap_letter(b: &u8) -> bool {
    let c = *b;
    // Plain comparisons are used instead of `...` range patterns: that
    // pattern syntax is deprecated and rejected outright by newer editions.
    c == b'_'
        || (b'0' <= c && c <= b'9')
        || (b'a' <= c && c <= b'z')
        || (b'A' <= c && c <= b'Z')
}

// Unit tests for `find_cap_ref`.
#[cfg(test)]
mod tests {
use super::{CaptureRef, find_cap_ref};

// Generates a `#[test]` function named `$name` that runs `find_cap_ref` on
// `$text`. The two-argument form asserts that no capture reference is found;
// the three-argument form asserts that the result equals `$capref`.
macro_rules! find {
($name:ident, $text:expr) => {
#[test]
fn $name() {
assert_eq!(None, find_cap_ref($text));
}
};
($name:ident, $text:expr, $capref:expr) => {
#[test]
fn $name() {
assert_eq!(Some($capref), find_cap_ref($text));
}
};
}

// Builds the expected `CaptureRef`. The first argument is either a string
// (a named reference) or an integer (a numbered reference) and is converted
// with `.into()`; the second is the expected `end` offset.
macro_rules! c {
($name_or_number:expr, $pos:expr) => {
CaptureRef { cap: $name_or_number.into(), end: $pos }
};
}

// Named references, with and without braces.
find!(find_cap_ref1, "$foo", c!("foo", 4));
find!(find_cap_ref2, "${foo}", c!("foo", 6));
// Numbered references.
find!(find_cap_ref3, "$0", c!(0, 2));
find!(find_cap_ref4, "$5", c!(5, 2));
find!(find_cap_ref5, "$10", c!(10, 3));
// Without braces the whole run `42a` is expected to be taken as a name.
find!(find_cap_ref6, "$42a", c!("42a", 4));
// With braces the reference is expected to end at the `}`, excluding the `a`.
find!(find_cap_ref7, "${42}a", c!(42, 5));
// An opening brace without a matching `}` is expected to yield no reference.
find!(find_cap_ref8, "${42");
find!(find_cap_ref9, "${42 ");
// Text that does not start with `$`, a lone `$`, and empty text are all
// expected to yield no reference.
find!(find_cap_ref10, " $0 ");
find!(find_cap_ref11, "$");
find!(find_cap_ref12, " ");
find!(find_cap_ref13, "");
}
23 changes: 21 additions & 2 deletions src/re_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use std::sync::Arc;
use memchr::memchr;

use exec::{Exec, ExecNoSync};
use expand::expand;
use expand::expand_bytes;
use error::Error;
use re_builder::bytes::RegexBuilder;
use re_trait::{self, RegularExpression, Slot};
Expand Down Expand Up @@ -375,6 +375,25 @@ impl Regex {
/// If no match is found, then a copy of the byte string is returned
/// unchanged.
///
/// # Replacement string syntax
///
/// All instances of `$name` in the replacement text are replaced with the
/// corresponding capture group `name`.
///
/// `name` may be an integer corresponding to the index of the
/// capture group (counted by order of opening parenthesis where `0` is the
/// entire match) or it can be a name (consisting of letters, digits or
/// underscores) corresponding to a named capture group.
///
/// If `name` isn't a valid capture group (whether the name doesn't exist
/// or isn't a valid index), then it is replaced with the empty string.
///
/// The longest possible name is used. e.g., `$1a` looks up the capture
/// group named `1a` and not the capture group at index `1`. To exert more
/// precise control over the name, use braces, e.g., `${1}a`.
///
/// To write a literal `$` use `$$`.
///
/// # Examples
///
/// Note that this function is polymorphic with respect to the replacement.
Expand Down Expand Up @@ -768,7 +787,7 @@ impl<'t> Captures<'t> {
///
/// To write a literal `$` use `$$`.
pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) {
expand(self, replacement, dst)
expand_bytes(self, replacement, dst)
}

/// Returns the number of captured groups.
Expand Down
Loading