Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract internal Regex API for PCRE backend #12802

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 15 additions & 75 deletions src/regex.cr
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
require "./regex/*"
require "./regex/engine"
require "./regex/match_data"

# A `Regex` represents a regular expression, a pattern that describes the
# contents of strings. A `Regex` can determine whether or not a string matches
Expand Down Expand Up @@ -195,6 +196,8 @@ require "./regex/*"
# `Hash` of `String` => `Int32`, and therefore requires named capture groups to have
# unique names within a single `Regex`.
class Regex
include Regex::Engine

# List of metacharacters that need to be escaped.
#
# See `Regex.needs_escape?` and `Regex.escape`.
Expand Down Expand Up @@ -253,28 +256,8 @@ class Regex
# options = Regex::Options::IGNORE_CASE | Regex::Options::EXTENDED
# Regex.new("dog", options) # => /dog/ix
# ```
def initialize(source : String, @options : Options = Options::None)
# PCRE's pattern must have their null characters escaped
source = source.gsub('\u{0}', "\\0")
@source = source

@re = LibPCRE.compile(@source, (options | Options::UTF_8 | Options::NO_UTF8_CHECK | Options::DUPNAMES | Options::UCP), out errptr, out erroffset, nil)
raise ArgumentError.new("#{String.new(errptr)} at #{erroffset}") if @re.null?
@extra = LibPCRE.study(@re, LibPCRE::STUDY_JIT_COMPILE, out studyerrptr)
if @extra.null? && studyerrptr
{% unless flag?(:interpreted) %}
LibPCRE.free.call @re.as(Void*)
{% end %}
raise ArgumentError.new("#{String.new(studyerrptr)}")
end
LibPCRE.full_info(@re, nil, LibPCRE::INFO_CAPTURECOUNT, out @captures)
end

def finalize
LibPCRE.free_study @extra
{% unless flag?(:interpreted) %}
LibPCRE.free.call @re.as(Void*)
{% end %}
def self.new(source : String, options : Options = Options::None)
new(_source: source, _options: options)
end

# Determines Regex's source validity. If it is valid, `nil` is returned.
Expand All @@ -285,15 +268,7 @@ class Regex
# Regex.error?("(foo|bar") # => "missing ) at 8"
# ```
def self.error?(source) : String?
re = LibPCRE.compile(source, (Options::UTF_8 | Options::NO_UTF8_CHECK | Options::DUPNAMES), out errptr, out erroffset, nil)
if re
{% unless flag?(:interpreted) %}
LibPCRE.free.call re.as(Void*)
{% end %}
nil
else
"#{String.new(errptr)} at #{erroffset}"
end
Engine.error_impl(source)
end

# Returns `true` if *char* needs to be escaped, `false` otherwise.
Expand Down Expand Up @@ -485,12 +460,10 @@ class Regex
# ```
def match(str, pos = 0, options = Regex::Options::None) : MatchData?
if byte_index = str.char_index_to_byte_index(pos)
match = match_at_byte_index(str, byte_index, options)
$~ = match_at_byte_index(str, byte_index, options)
else
match = nil
$~ = nil
end

$~ = match
end

# Match at byte index. Matches a regular expression against `String`
Expand All @@ -504,17 +477,11 @@ class Regex
# /(.)(.)/.match_at_byte_index("クリスタル", 3).try &.[2] # => "ス"
# ```
def match_at_byte_index(str, byte_index = 0, options = Regex::Options::None) : MatchData?
return ($~ = nil) if byte_index > str.bytesize

ovector_size = (@captures + 1) * 3
ovector = Pointer(Int32).malloc(ovector_size)
if internal_matches?(str, byte_index, options, ovector, ovector_size)
match = MatchData.new(self, @re, str, byte_index, ovector, @captures)
if byte_index > str.bytesize
$~ = nil
else
match = nil
$~ = match_impl(str, byte_index, options)
end

$~ = match
end

# Match at character index. It behaves like `#match`, however it returns a `Bool` value.
Expand All @@ -540,14 +507,7 @@ class Regex
def matches_at_byte_index?(str, byte_index = 0, options = Regex::Options::None) : Bool
return false if byte_index > str.bytesize

internal_matches?(str, byte_index, options, nil, 0)
end

# Calls the `pcre_exec` C function and handles its return value.
private def internal_matches?(str, byte_index, options, ovector, ovector_size)
ret = LibPCRE.exec(@re, @extra, str, str.bytesize, byte_index, (options | Options::NO_UTF8_CHECK), ovector, ovector_size)
# TODO: when `ret < -1`, it means PCRE error. It should handle correctly.
ret >= 0
matches_impl(str, byte_index, options)
end

# Returns a `Hash` where the values are the names of capture groups and the
Expand All @@ -561,26 +521,7 @@ class Regex
# /(.)(?<foo>.)(.)(?<bar>.)(.)/.name_table # => {4 => "bar", 2 => "foo"}
# ```
def name_table : Hash(Int32, String)
LibPCRE.full_info(@re, @extra, LibPCRE::INFO_NAMECOUNT, out name_count)
LibPCRE.full_info(@re, @extra, LibPCRE::INFO_NAMEENTRYSIZE, out name_entry_size)
table_pointer = Pointer(UInt8).null
LibPCRE.full_info(@re, @extra, LibPCRE::INFO_NAMETABLE, pointerof(table_pointer).as(Pointer(Int32)))
name_table = table_pointer.to_slice(name_entry_size*name_count)

lookup = Hash(Int32, String).new

name_count.times do |i|
capture_offset = i * name_entry_size
capture_number = ((name_table[capture_offset].to_u16 << 8)).to_i32 | name_table[capture_offset + 1]

name_offset = capture_offset + 2
checked = name_table[name_offset, name_entry_size - 3]
name = String.new(checked.to_unsafe)

lookup[capture_number] = name
end

lookup
name_table_impl
end

# Returns the number of (named & non-named) capture groups.
Expand All @@ -592,8 +533,7 @@ class Regex
# /(.)|(.)/.capture_count # => 2
# ```
def capture_count : Int32
LibPCRE.full_info(@re, @extra, LibPCRE::INFO_CAPTURECOUNT, out capture_count)
capture_count
capture_count_impl
end

# Convert to `String` in subpattern format. Produces a `String` which can be
Expand Down
4 changes: 4 additions & 0 deletions src/regex/engine.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
require "./pcre"

# :nodoc:
alias Regex::Engine = PCRE
caspiano marked this conversation as resolved.
Show resolved Hide resolved
10 changes: 10 additions & 0 deletions src/regex/lib_pcre.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@
lib LibPCRE
alias Int = LibC::Int

CASELESS = 0x00000001
MULTILINE = 0x00000002
DOTALL = 0x00000004
EXTENDED = 0x00000008
ANCHORED = 0x00000010
UTF8 = 0x00000800
NO_UTF8_CHECK = 0x00002000
DUPNAMES = 0x00080000
UCP = 0x20000000

type Pcre = Void*
type PcreExtra = Void*
fun compile = pcre_compile(pattern : UInt8*, options : Int, errptr : UInt8**, erroffset : Int*, tableptr : Void*) : Pcre
Expand Down
65 changes: 14 additions & 51 deletions src/regex/match_data.cr
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class Regex
# starting from `1`, so that `0` can be used to refer to the entire regular
# expression without needing to capture it explicitly.
struct MatchData
include Engine::MatchData

# Returns the original regular expression.
#
# ```
Expand All @@ -39,10 +41,6 @@ class Regex
# ```
getter string : String

# :nodoc:
def initialize(@regex : Regex, @code : LibPCRE::Pcre, @string : String, @pos : Int32, @ovector : Int32*, @group_size : Int32)
end

# Returns the number of elements in this match object.
#
# ```
Expand Down Expand Up @@ -109,10 +107,7 @@ class Regex
# ```
def byte_begin(n = 0) : Int32
check_index_out_of_bounds n
n += size if n < 0
value = @ovector[n * 2]
raise_capture_group_was_not_matched(n) if value < 0
value
byte_range(n) { |normalized_n| raise_capture_group_was_not_matched(normalized_n) }.begin
end

# Returns the position of the next byte after the match.
Expand All @@ -132,10 +127,7 @@ class Regex
# ```
def byte_end(n = 0) : Int32
check_index_out_of_bounds n
n += size if n < 0
value = @ovector[n * 2 + 1]
raise_capture_group_was_not_matched(n) if value < 0
value
byte_range(n) { |normalized_n| raise_capture_group_was_not_matched(normalized_n) }.end
end

# Returns the match of the *n*th capture group, or `nil` if there isn't
Expand All @@ -151,11 +143,8 @@ class Regex
def []?(n : Int) : String?
return unless valid_group?(n)

n += size if n < 0
start = @ovector[n * 2]
finish = @ovector[n * 2 + 1]
return if start < 0
@string.byte_slice(start, finish - start)
range = byte_range(n) { return nil }
@string.byte_slice(range.begin, range.end - range.begin)
end

# Returns the match of the *n*th capture group, or raises an `IndexError`
Expand All @@ -167,11 +156,9 @@ class Regex
# ```
def [](n : Int) : String
check_index_out_of_bounds n
n += size if n < 0

value = self[n]?
raise_capture_group_was_not_matched n if value.nil?
value
range = byte_range(n) { |normalized_n| raise_capture_group_was_not_matched(normalized_n) }
@string.byte_slice(range.begin, range.end - range.begin)
end

# Returns the match of the capture group named by *group_name*, or
Expand All @@ -189,16 +176,7 @@ class Regex
# "Crystal".match(/(?<ok>Cr).*(?<ok>al)/).not_nil!["ok"]? # => "al"
# ```
def []?(group_name : String) : String?
max_start = -1
match = nil
named_capture_number(group_name) do |n|
start = @ovector[n * 2]
if start > max_start
max_start = start
match = self[n]?
end
end
match
fetch_impl(group_name) { nil }
end

# Returns the match of the capture group named by *group_name*, or
Expand All @@ -216,14 +194,13 @@ class Regex
# "Crystal".match(/(?<ok>Cr).*(?<ok>al)/).not_nil!["ok"] # => "al"
# ```
def [](group_name : String) : String
match = self[group_name]?
unless match
named_capture_number(group_name) do
fetch_impl(group_name) { |exists|
if exists
raise KeyError.new("Capture group '#{group_name}' was not matched")
else
raise KeyError.new("Capture group '#{group_name}' does not exist")
end
raise KeyError.new("Capture group '#{group_name}' does not exist")
end
match
}
end

# Returns all matches that are within the given range.
Expand All @@ -249,20 +226,6 @@ class Regex
Array(String).new(count) { |i| self[start + i] }
end

private def named_capture_number(group_name)
name_entry_size = LibPCRE.get_stringtable_entries(@code, group_name, out first, out last)
return if name_entry_size < 0

while first <= last
capture_number = (first[0].to_u16 << 8) | first[1].to_u16
yield capture_number

first += name_entry_size
end

nil
end

# Returns the part of the original string before the match. If the match
# starts at the start of the string, returns the empty string.
#
Expand Down
Loading