Skip to content

regex 0.2 #310

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Dec 31, 2016
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
d44a9f9
Switch bytes::Regex to using Unicode mode by default.
BurntSushi May 2, 2016
ebd26e9
Update Replacer trait for Unicode regexes.
BurntSushi May 2, 2016
f98219b
Remove the is_empty method on Captures.
BurntSushi May 7, 2016
83fce85
Drop the PartialEq and Eq impls on Regex.
BurntSushi May 7, 2016
d12042b
Use correct lifetimes for SubCaptures and SubCapturesNamed types.
BurntSushi May 7, 2016
e1a94bb
Remove Regex::with_size_limit.
BurntSushi May 7, 2016
cfd887d
Remove free is_match function.
BurntSushi May 7, 2016
24f86b0
Rename RegexSplits to Splits.
BurntSushi May 7, 2016
a6722a3
Reorganize capture slot handling, but don't make any public API changes.
BurntSushi May 7, 2016
2632c2f
Rename many of the iterator types.
BurntSushi May 17, 2016
52165d6
Use `Cow` for replacements.
BurntSushi May 17, 2016
2805811
Update the Error type.
BurntSushi May 18, 2016
384e937
find/find_iter now return a Match instead of (usize, usize).
BurntSushi Aug 5, 2016
fab4069
Remove the submatch iterators.
BurntSushi Aug 5, 2016
1f7f5c9
Fix tests.
BurntSushi Aug 5, 2016
403b27a
Switch to more idiomatic builder definition.
BurntSushi Aug 21, 2016
3f1fde5
Rename iterator types to match `std` conventions.
BurntSushi Aug 21, 2016
8ee9262
Changed the name of quote to escape.
Nov 15, 2016
bc06024
Make ASCII classes consistent with other engines.
BurntSushi Dec 30, 2016
dd120a9
Require escaping of [, &, - and ~ in classes.
BurntSushi Dec 30, 2016
374f139
Add SubCaptureMatches iterator on Captures.
BurntSushi Dec 30, 2016
c4faddf
Remove custom extend_from_slice implementation.
BurntSushi Dec 30, 2016
66c6ddf
Fix performance bug with Match.
BurntSushi Dec 31, 2016
0c59d41
Add RegexSetBuilder.
BurntSushi Dec 31, 2016
63132b5
Documentation updates and clean ups.
BurntSushi Dec 31, 2016
f094d15
Update github links.
BurntSushi Dec 31, 2016
ac3ab6d
Bump versions everywhere and update CHANGELOG.
BurntSushi Dec 30, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Reorganize capture slot handling, but don't make any public API changes.
  • Loading branch information
BurntSushi committed Dec 30, 2016
commit a6722a32ecd2888f93e533abd56413a5493f7e75
10 changes: 5 additions & 5 deletions regex-capi/src/rure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ pub struct rure_match {
pub end: size_t,
}

pub struct Captures(Vec<Option<usize>>);
pub struct Captures(bytes::Locations);

pub struct Iter {
re: *const Regex,
Expand Down Expand Up @@ -404,7 +404,7 @@ ffi_fn! {
ffi_fn! {
fn rure_captures_new(re: *const Regex) -> *mut Captures {
let re = unsafe { &*re };
let captures = Captures(vec![None; 2 * re.captures_len()]);
let captures = Captures(re.locations());
Box::into_raw(Box::new(captures))
}
}
Expand All @@ -421,9 +421,9 @@ ffi_fn! {
i: size_t,
match_info: *mut rure_match,
) -> bool {
let captures = unsafe { &(*captures).0 };
match (captures[i * 2], captures[i * 2 + 1]) {
(Some(start), Some(end)) => {
let locs = unsafe { &(*captures).0 };
match locs.pos(i) {
Some((start, end)) => {
if !match_info.is_null() {
unsafe {
(*match_info).start = start;
Expand Down
9 changes: 5 additions & 4 deletions src/exec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ use prog::Program;
use re_builder::RegexOptions;
use re_bytes;
use re_set;
use re_trait::{RegularExpression, Slot};
use re_trait::{RegularExpression, Slot, Locations, as_slots};
use re_unicode;
use utf8::next_utf8;

Expand Down Expand Up @@ -343,11 +343,11 @@ impl<'c> RegularExpression for ExecNoSyncStr<'c> {
#[inline(always)] // reduces constant overhead
fn read_captures_at(
&self,
slots: &mut [Slot],
locs: &mut Locations,
text: &str,
start: usize,
) -> Option<(usize, usize)> {
self.0.read_captures_at(slots, text.as_bytes(), start)
self.0.read_captures_at(locs, text.as_bytes(), start)
}
}

Expand Down Expand Up @@ -512,10 +512,11 @@ impl<'c> RegularExpression for ExecNoSync<'c> {
/// locations of the overall match.
fn read_captures_at(
&self,
slots: &mut [Slot],
locs: &mut Locations,
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
let slots = as_slots(locs);
for slot in slots.iter_mut() {
*slot = None;
}
Expand Down
6 changes: 4 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -460,8 +460,9 @@ extern crate utf8_ranges;
pub use error::Error;
pub use re_builder::unicode::*;
pub use re_set::unicode::*;
pub use re_trait::{Locations, SubCapturesPos};
pub use re_unicode::{
Regex, Captures, SubCaptures, SubCapturesPos, SubCapturesNamed,
Regex, Captures, SubCaptures, SubCapturesNamed,
CaptureNames, FindCaptures, FindMatches,
Replacer, NoExpand, Splits, SplitsN,
quote,
Expand Down Expand Up @@ -554,8 +555,9 @@ performance on `&str`.
*/
pub mod bytes {
pub use re_builder::bytes::*;
pub use re_set::bytes::*;
pub use re_bytes::*;
pub use re_set::bytes::*;
pub use re_trait::{Locations, SubCapturesPos};
}

mod backtrack;
Expand Down
151 changes: 67 additions & 84 deletions src/re_bytes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use exec::{Exec, ExecNoSync};
use expand::expand_bytes;
use error::Error;
use re_builder::bytes::RegexBuilder;
use re_trait::{self, RegularExpression, Slot};
use re_trait::{self, RegularExpression, Locations, SubCapturesPos};

/// A compiled regular expression for matching arbitrary bytes.
///
Expand Down Expand Up @@ -71,6 +71,7 @@ impl FromStr for Regex {
}
}

/// Core regular expression methods.
impl Regex {
/// Compiles a regular expression. Once compiled, it can be used repeatedly
/// to search, split or replace text in a string.
Expand Down Expand Up @@ -102,17 +103,6 @@ impl Regex {
self.is_match_at(text, 0)
}

/// Returns the same as is_match, but starts the search at the given
/// offset.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
#[doc(hidden)]
pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
self.shortest_match_at(text, start).is_some()
}

/// Returns the start and end byte range of the leftmost-first match in
/// `text`. If no match exists, then `None` is returned.
///
Expand All @@ -137,21 +127,6 @@ impl Regex {
self.find_at(text, 0)
}

/// Returns the same as find, but starts the search at the given
/// offset.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
#[doc(hidden)]
pub fn find_at(
&self,
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
self.0.searcher().find_at(text, start)
}

/// Returns an iterator for each successive non-overlapping match in
/// `text`, returning the start and end byte indices with respect to
/// `text`.
Expand Down Expand Up @@ -243,30 +218,14 @@ impl Regex {
/// The `0`th capture group is always unnamed, so it must always be
/// accessed with `at(0)` or `[0]`.
pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> {
let mut slots = vec![None; 2 * self.captures_len()];
self.read_captures_at(&mut slots, text, 0).map(|_| Captures {
let mut locs = self.locations();
self.read_captures_at(&mut locs, text, 0).map(|_| Captures {
text: text,
slots: slots,
locs: locs,
named_groups: self.0.capture_name_idx().clone(),
})
}

/// Returns the same as captures, but starts the search at the given
/// offset and populates the capture locations given.
///
/// The significance of the starting point is that it takes the surrounding
/// context into consideration. For example, the `\A` anchor can only
/// match when `start == 0`.
#[doc(hidden)]
pub fn read_captures_at(
&self,
slots: &mut [Slot],
text: &[u8],
start: usize,
) -> Option<(usize, usize)> {
self.0.searcher().read_captures_at(slots, text, start)
}

/// Returns an iterator over all the non-overlapping capture groups matched
/// in `text`. This is operationally the same as `find_iter`, except it
/// yields information about submatches.
Expand Down Expand Up @@ -513,7 +472,10 @@ impl Regex {
extend_from_slice(&mut new, &text[last_match..]);
new
}
}

/// Advanced or "lower level" search methods.
impl Regex {
/// Returns the end location of a match in the text given.
///
/// This method may have the same performance characteristics as
Expand Down Expand Up @@ -554,6 +516,51 @@ impl Regex {
self.0.searcher().shortest_match_at(text, start)
}

/// Reports whether this regex matches `text`, exactly like `is_match`, except
/// that the search begins at the offset `start`.
///
/// The starting offset is significant because the search still takes the
/// surrounding context into account. For example, the `\A` anchor can only
/// match when `start == 0`.
#[doc(hidden)]
pub fn is_match_at(&self, text: &[u8], start: usize) -> bool {
    // Only the existence of a match matters here, so the shortest-match
    // search is sufficient; its reported end offset is discarded.
    let shortest_end = self.shortest_match_at(text, start);
    shortest_end.is_some()
}

/// Returns the start and end offsets of a match, exactly like `find`, except
/// that the search begins at the offset `start`.
///
/// The starting offset is significant because the search still takes the
/// surrounding context into account. For example, the `\A` anchor can only
/// match when `start == 0`.
#[doc(hidden)]
pub fn find_at(&self, text: &[u8], start: usize) -> Option<(usize, usize)> {
    // Delegate to the searcher obtained from the underlying executor.
    let searcher = self.0.searcher();
    searcher.find_at(text, start)
}

/// Runs a capture search like `captures`, except that the search begins at
/// the offset `start` and the capture locations are written into `locs`.
///
/// On a match, the returned pair is the `(start, end)` reported by the
/// underlying searcher; `None` means no match was found.
///
/// The starting offset is significant because the search still takes the
/// surrounding context into account. For example, the `\A` anchor can only
/// match when `start == 0`.
#[doc(hidden)]
pub fn read_captures_at(
    &self,
    locs: &mut Locations,
    text: &[u8],
    start: usize,
) -> Option<(usize, usize)> {
    // Delegate to the searcher obtained from the underlying executor.
    let searcher = self.0.searcher();
    searcher.read_captures_at(locs, text, start)
}
}

/// Auxiliary methods.
impl Regex {
/// Returns the original string of this regex.
pub fn as_str(&self) -> &str {
&self.0.regex_strings()[0]
Expand All @@ -568,6 +575,13 @@ impl Regex {
pub fn captures_len(&self) -> usize {
self.0.capture_names().len()
}

/// Returns an empty set of locations that can be reused in multiple calls
/// to `read_captures_at`.
///
/// The value is produced by the underlying searcher's `locations` method.
#[doc(hidden)]
pub fn locations(&self) -> Locations {
self.0.searcher().locations()
}
}

/// An iterator over all non-overlapping matches for a particular string.
Expand Down Expand Up @@ -601,9 +615,9 @@ impl<'r, 't> Iterator for FindCaptures<'r, 't> {
type Item = Captures<'t>;

fn next(&mut self) -> Option<Captures<'t>> {
self.0.next().map(|slots| Captures {
self.0.next().map(|locs| Captures {
text: self.0.text(),
slots: slots,
locs: locs,
named_groups: self.0.regex().capture_name_idx().clone(),
})
}
Expand Down Expand Up @@ -704,7 +718,7 @@ impl<'r> Iterator for CaptureNames<'r> {
/// `'t` is the lifetime of the matched text.
pub struct Captures<'t> {
text: &'t [u8],
slots: Vec<Option<usize>>,
locs: Locations,
named_groups: Arc<HashMap<String, usize>>,
}

Expand All @@ -714,11 +728,7 @@ impl<'t> Captures<'t> {
/// not match anything. The positions returned are *always* byte indices
/// with respect to the original byte string matched.
pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
let (s, e) = (i * 2, i * 2 + 1);
match (self.slots.get(s), self.slots.get(e)) {
(Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
_ => None,
}
self.locs.pos(i)
}

/// Returns the matched string for the capture group `i`. If `i` isn't
Expand Down Expand Up @@ -747,8 +757,8 @@ impl<'t> Captures<'t> {
/// Creates an iterator of all the capture group positions in order of
/// appearance in the regular expression. Positions are byte indices
/// in terms of the original string matched.
pub fn iter_pos<'c>(&'c self) -> SubCapturesPos<'c> {
SubCapturesPos { idx: 0, slots: &self.slots }
pub fn iter_pos(&self) -> SubCapturesPos {
self.locs.iter()
}

/// Creates an iterator of all named groups as a tuple with the group
Expand Down Expand Up @@ -787,7 +797,7 @@ impl<'t> Captures<'t> {
/// group that corresponds to the full match.
#[inline]
pub fn len(&self) -> usize {
self.slots.len() / 2
self.locs.len()
}
}

Expand Down Expand Up @@ -895,33 +905,6 @@ impl<'c, 't> Iterator for SubCaptures<'c, 't> {
}
}

/// An iterator over capture group positions for a particular match of a
/// regular expression.
///
/// Positions are byte indices in terms of the original byte string matched.
/// Each item is `Some((start, end))` for a group that participated in the
/// match and `None` for a group that did not.
///
/// `'c` is the lifetime of the captures.
pub struct SubCapturesPos<'c> {
    /// Index of the next start slot to read; advances by two per item.
    idx: usize,
    /// Flat slot list: the slots at `idx` and `idx + 1` hold the start and
    /// end offsets of one capture group.
    slots: &'c [Option<usize>],
}

impl<'c> Iterator for SubCapturesPos<'c> {
    type Item = Option<(usize, usize)>;

    fn next(&mut self) -> Option<Option<(usize, usize)>> {
        // Use checked access for both slots of the pair. Checking only the
        // start index and then indexing `idx + 1` would panic on a slice
        // with an odd number of slots; a dangling trailing slot is instead
        // treated as the end of iteration.
        let (start, end) = match (
            self.slots.get(self.idx),
            self.slots.get(self.idx + 1),
        ) {
            (Some(&start), Some(&end)) => (start, end),
            _ => return None,
        };
        self.idx += 2;
        Some(match (start, end) {
            (Some(s), Some(e)) => Some((s, e)),
            _ => None,
        })
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // Every remaining complete pair of slots yields exactly one item.
        let remaining = self.slots.len().saturating_sub(self.idx) / 2;
        (remaining, Some(remaining))
    }
}

/// An Iterator over named capture groups as a tuple with the group name and
/// the value.
///
Expand Down
11 changes: 8 additions & 3 deletions src/re_plugin.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use re_trait::{RegularExpression, Slot};
use re_trait::{RegularExpression, Slot, Locations, as_slots};

/// Plugin is the compiler plugin's data structure. It declares some static
/// data (like capture groups and the original regex string), but defines its
Expand Down Expand Up @@ -67,15 +67,20 @@ impl RegularExpression for Plugin {

fn find_at(&self, text: &str, start: usize) -> Option<(usize, usize)> {
let mut slots = [None, None];
self.read_captures_at(&mut slots, text, start)
(self.prog)(&mut slots, text, start);
match (slots[0], slots[1]) {
(Some(s), Some(e)) => Some((s, e)),
_ => None,
}
}

fn read_captures_at<'t>(
&self,
slots: &mut [Slot],
locs: &mut Locations,
text: &'t str,
start: usize,
) -> Option<(usize, usize)> {
let slots = as_slots(locs);
for slot in slots.iter_mut() {
*slot = None;
}
Expand Down
Loading