Skip to content

[5.7] Recover from parser errors #519

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jun 30, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Introduce AST.Atom.Number
This stores a source location and allows the value
to be `nil`, which is necessary to enable parser
recovery in cases where we expect a number but
parse something that, e.g., overflows.
  • Loading branch information
hamishknight committed Jun 30, 2022
commit baa94386b549fc1b958c7a719d109cda4894172f
25 changes: 19 additions & 6 deletions Sources/_RegexParser/Regex/AST/AST.swift
Original file line number Diff line number Diff line change
Expand Up @@ -265,33 +265,46 @@ extension AST {
public enum Kind: Hashable {
// \n \gn \g{n} \g<n> \g'n' (?n) (?(n)...
// Oniguruma: \k<n>, \k'n'
case absolute(Int)
case absolute(AST.Atom.Number)

// \g{-n} \g<+n> \g'+n' \g<-n> \g'-n' (?+n) (?-n)
// (?(+n)... (?(-n)...
// Oniguruma: \k<-n> \k<+n> \k'-n' \k'+n'
case relative(Int)
case relative(AST.Atom.Number)

// \k<name> \k'name' \g{name} \k{name} (?P=name)
// \g<name> \g'name' (?&name) (?P>name)
// (?(<name>)... (?('name')... (?(name)...
case named(String)

/// (?R), (?(R)..., which are equivalent to (?0), (?(0)...
static var recurseWholePattern: Kind { .absolute(0) }
static func recurseWholePattern(_ loc: SourceLocation) -> Kind {
// Whole-pattern recursion is modeled as the absolute reference 0, with the
// number located at `loc`.
.absolute(.init(0, at: loc))
}

/// Whether this is a reference that recurses the whole pattern, rather
/// than a group.
///
/// This is `true` only for an `.absolute` reference whose number is 0.
public var recursesWholePattern: Bool {
switch self {
case .absolute(let a):
// `a.value` is optional; a missing value compares unequal to 0, so an
// invalid number is not treated as whole-pattern recursion.
return a.value == 0
default:
return false
}
}
}
public var kind: Kind

/// An additional specifier supported by Oniguruma that specifies what
/// recursion level the group being referenced belongs to.
public var recursionLevel: Located<Int>?
public var recursionLevel: AST.Atom.Number?

/// The location of the inner numeric or textual reference, e.g. the location
/// of '-2' in '\g{-2}'. Note this includes the recursion level for e.g.
/// '\k<a+2>'.
public var innerLoc: SourceLocation

public init(_ kind: Kind, recursionLevel: Located<Int>? = nil,
public init(_ kind: Kind, recursionLevel: AST.Atom.Number? = nil,
innerLoc: SourceLocation) {
self.kind = kind
self.recursionLevel = recursionLevel
Expand All @@ -300,7 +313,7 @@ extension AST {

/// Whether this is a reference that recurses the whole pattern, rather than
/// a group.
public var recursesWholePattern: Bool { kind == .recurseWholePattern }
public var recursesWholePattern: Bool { kind.recursesWholePattern }
}

/// A set of global matching options in a regular expression literal.
Expand Down
14 changes: 13 additions & 1 deletion Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,18 @@ extension AST.Atom {
}

extension AST.Atom {
/// An integer in the regex AST, paired with a source location.
///
/// The integer is optional so that an AST node can still be formed when a
/// number was expected but no valid value could be produced.
public struct Number: Hashable {
/// The value, which may be `nil` in an invalid AST, e.g. the parser expected
/// a number at a given location, or the parsed number overflowed.
public var value: Int?
/// The source location associated with this number.
public var location: SourceLocation

/// Creates a number from an optional `value` and the `location` it is
/// associated with.
public init(_ value: Int?, at location: SourceLocation) {
self.value = value
self.location = location
}
}

public struct Scalar: Hashable {
public var value: UnicodeScalar
public var location: SourceLocation
Expand Down Expand Up @@ -558,7 +570,7 @@ extension AST.Atom {
/// A PCRE callout written `(?C...)`
public struct PCRE: Hashable {
public enum Argument: Hashable {
case number(Int)
case number(AST.Atom.Number)
case string(String)
}
public var arg: AST.Located<Argument>
Expand Down
8 changes: 5 additions & 3 deletions Sources/_RegexParser/Regex/AST/Conditional.swift
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,13 @@ extension AST.Conditional {

extension AST.Conditional.Condition {
public struct PCREVersionNumber: Hashable {
public var major: Int
public var minor: Int
public var major: AST.Atom.Number
public var minor: AST.Atom.Number
public var location: SourceLocation

public init(major: Int, minor: Int, _ location: SourceLocation) {
public init(
major: AST.Atom.Number, minor: AST.Atom.Number, _ location: SourceLocation
) {
self.major = major
self.minor = minor
self.location = location
Expand Down
6 changes: 3 additions & 3 deletions Sources/_RegexParser/Regex/AST/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -175,13 +175,13 @@ extension AST {
}
public enum Kind: Hashable {
/// (*LIMIT_DEPTH=d)
case limitDepth(Located<Int>)
case limitDepth(AST.Atom.Number)

/// (*LIMIT_HEAP=d)
case limitHeap(Located<Int>)
case limitHeap(AST.Atom.Number)

/// (*LIMIT_MATCH=d)
case limitMatch(Located<Int>)
case limitMatch(AST.Atom.Number)

/// (*NOTEMPTY)
case notEmpty
Expand Down
16 changes: 8 additions & 8 deletions Sources/_RegexParser/Regex/AST/Quantification.swift
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ extension AST {
}

public enum Amount: Hashable {
case zeroOrMore // *
case oneOrMore // +
case zeroOrOne // ?
case exactly(Located<Int>) // {n}
case nOrMore(Located<Int>) // {n,}
case upToN(Located<Int>) // {,n}
case range(Located<Int>, Located<Int>) // {n,m}
case zeroOrMore // *
case oneOrMore // +
case zeroOrOne // ?
case exactly(AST.Atom.Number) // {n}
case nOrMore(AST.Atom.Number) // {n,}
case upToN(AST.Atom.Number) // {,n}
case range(AST.Atom.Number, AST.Atom.Number) // {n,m}
}

public enum Kind: String, Hashable {
Expand All @@ -58,7 +58,7 @@ extension AST {

extension AST.Quantification.Amount {
/// The bounds.
public var bounds: (atLeast: Int, atMost: Int?) {
public var bounds: (atLeast: Int?, atMost: Int?) {
switch self {
case .zeroOrMore: return (0, nil)
case .oneOrMore: return (1, nil)
Expand Down
69 changes: 37 additions & 32 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -253,18 +253,18 @@ extension Source {
///
/// Throws on overflow
///
private mutating func lexNumber<Num: FixedWidthInteger>(
_ ty: Num.Type, _ kind: RadixKind
) throws -> Located<Num>? {
private mutating func lexNumber(
_ kind: RadixKind
) throws -> AST.Atom.Number? {
try recordLoc { src in
guard let str = src.tryEatPrefix(kind.characterFilter)?.string else {
guard let str = src.tryEatLocatedPrefix(kind.characterFilter) else {
return nil
}
guard let i = Num(str, radix: kind.radix) else {
throw ParseError.numberOverflow(str)
guard let i = Int(str.value, radix: kind.radix) else {
throw ParseError.numberOverflow(str.value)
}
return i
}
return .init(i, at: str.location)
}.value
}

/// Try to eat a number off the front.
Expand All @@ -273,11 +273,11 @@ extension Source {
///
/// Throws on overflow
///
mutating func lexNumber() throws -> Located<Int>? {
try lexNumber(Int.self, .decimal)
mutating func lexNumber() throws -> AST.Atom.Number? {
try lexNumber(.decimal)
}

mutating func expectNumber() throws -> Located<Int> {
mutating func expectNumber() throws -> AST.Atom.Number {
guard let num = try lexNumber() else {
throw ParseError.expectedNumber("", kind: .decimal)
}
Expand Down Expand Up @@ -488,9 +488,10 @@ extension Source {

if let t = src.lexWhitespace() { trivia.append(t) }

let upperOpt = try src.lexNumber()?.map { upper in
var upperOpt = try src.lexNumber()
if closedRange == false {
// If we have an open range, the upper bound should be adjusted down.
closedRange == true ? upper : upper - 1
upperOpt?.value? -= 1
}

if let t = src.lexWhitespace() { trivia.append(t) }
Expand Down Expand Up @@ -1066,10 +1067,11 @@ extension Source {
///
private mutating func expectPCREVersionNumber(
) throws -> AST.Conditional.Condition.PCREVersionNumber {
let nums = try recordLoc { src -> (major: Int, minor: Int) in
let major = try src.expectNumber().value
let nums = try recordLoc { src -> (major: AST.Atom.Number,
minor: AST.Atom.Number) in
let major = try src.expectNumber()
try src.expect(".")
let minor = try src.expectNumber().value
let minor = try src.expectNumber()
return (major, minor)
}
return .init(major: nums.value.major, minor: nums.value.minor,
Expand Down Expand Up @@ -1119,7 +1121,7 @@ extension Source {
}
if let num = try src.lexNumber() {
return .groupRecursionCheck(
.init(.absolute(num.value), innerLoc: num.location))
.init(.absolute(num), innerLoc: num.location))
}
return .recursionCheck
}
Expand Down Expand Up @@ -1406,20 +1408,21 @@ extension Source {
let kind = try recordLoc { src -> AST.Reference.Kind? in
try src.tryEating { src in
// Note this logic should match canLexNumberedReference.
if src.tryEat("+"), let num = try src.lexNumber() {
return .relative(num.value)
if let plus = src.tryEatWithLoc("+"), let num = try src.lexNumber() {
return .relative(.init(num.value, at: num.location.union(with: plus)))
}
if src.tryEat("-"), let num = try src.lexNumber() {
return .relative(-num.value)
if let minus = src.tryEatWithLoc("-"), let num = try src.lexNumber() {
let val = num.value.map { x in -x }
return .relative(.init(val, at: num.location.union(with: minus)))
}
if let num = try src.lexNumber() {
return .absolute(num.value)
return .absolute(num)
}
return nil
}
}
guard let kind = kind else { return nil }
guard allowWholePatternRef || kind.value != .recurseWholePattern else {
guard allowWholePatternRef || !kind.value.recursesWholePattern else {
throw ParseError.cannotReferToWholePattern
}
let recLevel = allowRecursionLevel ? try lexRecursionLevel() : nil
Expand All @@ -1432,12 +1435,14 @@ extension Source {
/// RecursionLevel -> '+' <Int> | '-' <Int>
///
private mutating func lexRecursionLevel(
) throws -> Located<Int>? {
try recordLoc { src in
) throws -> AST.Atom.Number? {
let value = try recordLoc { src -> Int? in
if src.tryEat("+") { return try src.expectNumber().value }
if src.tryEat("-") { return try -src.expectNumber().value }
if src.tryEat("-") { return try src.expectNumber().value.map { x in -x } }
return nil
}
guard let value = value else { return nil }
return .init(value.value, at: value.location)
}

/// Checks whether a numbered reference can be lexed.
Expand Down Expand Up @@ -1579,9 +1584,8 @@ extension Source {
}

// Backslash followed by a non-0 digit character is a backreference.
if firstChar != "0", let numAndLoc = try src.lexNumber() {
return .backreference(.init(
.absolute(numAndLoc.value), innerLoc: numAndLoc.location))
if firstChar != "0", let num = try src.lexNumber() {
return .backreference(.init(.absolute(num), innerLoc: num.location))
}
return nil
}
Expand Down Expand Up @@ -1621,7 +1625,7 @@ extension Source {
// Whole-pattern recursion, which is equivalent to (?0).
if let loc = src.tryEatWithLoc("R") {
try src.expect(")")
return .subpattern(.init(.recurseWholePattern, innerLoc: loc))
return .subpattern(.init(.recurseWholePattern(loc), innerLoc: loc))
}

// Numbered subpattern reference.
Expand Down Expand Up @@ -1772,11 +1776,12 @@ extension Source {
let arg = try recordLoc { src -> AST.Atom.Callout.PCRE.Argument in
// Parse '(?C' followed by a number.
if let num = try src.lexNumber() {
return .number(num.value)
return .number(num)
}
// '(?C)' is implicitly '(?C0)'.
if src.peek() == ")" {
return .number(0)
let pos = src.currentPosition
return .number(.init(0, at: SourceLocation(pos ..< pos)))
}
// Parse '(?C' followed by a set of balanced delimiters as defined by
// http://pcre.org/current/doc/html/pcre2pattern.html#SEC28
Expand Down
2 changes: 2 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,10 @@ struct ParsingContext {
func isPriorGroupRef(_ ref: AST.Reference.Kind) -> Bool {
switch ref {
case .absolute(let i):
guard let i = i.value else { return false }
return i <= priorGroupCount
case .relative(let i):
guard let i = i.value else { return false }
return i < 0
case .named(let str):
return usedGroupNames.contains(str)
Expand Down
9 changes: 5 additions & 4 deletions Sources/_RegexParser/Regex/Parse/Sema.swift
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ extension RegexValidator {
throw error(.unsupported("recursion level"), at: recLevel.location)
}
switch ref.kind {
case .absolute(let i):
case .absolute(let num):
guard let i = num.value else { break }
guard i < captures.captures.count else {
throw error(.invalidReference(i), at: ref.innerLoc)
}
Expand Down Expand Up @@ -359,9 +360,9 @@ extension RegexValidator {
}
switch quant.amount.value {
case .range(let lhs, let rhs):
guard lhs.value <= rhs.value else {
throw error(
.invalidQuantifierRange(lhs.value, rhs.value), at: quant.location)
guard let lhs = lhs.value, let rhs = rhs.value else { break }
guard lhs <= rhs else {
throw error(.invalidQuantifierRange(lhs, rhs), at: quant.location)
}
case .zeroOrMore, .oneOrMore, .zeroOrOne, .exactly, .nOrMore, .upToN:
break
Expand Down
16 changes: 11 additions & 5 deletions Sources/_RegexParser/Regex/Printing/DumpAST.swift
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,12 @@ extension AST.Atom {
}
}

extension AST.Atom.Number: _ASTPrintable {
/// The base dump text for this number: the integer value, or `<invalid>`
/// when the value is `nil`.
public var _dumpBase: String {
value.map { "\($0)" } ?? "<invalid>"
}
}

extension AST.Atom.Callout: _ASTPrintable {
public var _dumpBase: String {
switch self {
Expand Down Expand Up @@ -227,7 +233,7 @@ extension AST.Reference: _ASTPrintable {
public var _dumpBase: String {
var result = "\(kind)"
if let recursionLevel = recursionLevel {
result += "\(recursionLevel.value)"
result += "\(recursionLevel)"
}
return result
}
Expand Down Expand Up @@ -270,11 +276,11 @@ extension AST.Quantification.Amount: _ASTPrintable {
case .zeroOrMore: return "zeroOrMore"
case .oneOrMore: return "oneOrMore"
case .zeroOrOne: return "zeroOrOne"
case let .exactly(n): return "exactly<\(n.value)>"
case let .nOrMore(n): return "nOrMore<\(n.value)>"
case let .upToN(n): return "uptoN<\(n.value)>"
case let .exactly(n): return "exactly<\(n)>"
case let .nOrMore(n): return "nOrMore<\(n)>"
case let .upToN(n): return "uptoN<\(n)>"
case let .range(lower, upper):
return ".range<\(lower.value)...\(upper.value)>"
return ".range<\(lower)...\(upper)>"
}
}
}
Expand Down
Loading