Skip to content

Implement instructions for matching builtin character classes and assertions #547

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Aug 3, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3b6b676
Copy over new ascii bitset
rctcwyvrn Jul 5, 2022
33caa79
Add matchBuiltin
rctcwyvrn Jul 5, 2022
139daa5
Remove debug prints
rctcwyvrn Jul 5, 2022
9abf4af
Remove bitset fast path
rctcwyvrn Jul 5, 2022
286f5d8
Fully remove remnants of the bitset fast path
rctcwyvrn Jul 6, 2022
9e915cd
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 7, 2022
e593ddb
Completely replace AssertionFunction with regexAssert(by:)
rctcwyvrn Jul 11, 2022
25dc277
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 12, 2022
3e38ac6
Cleanup
rctcwyvrn Jul 12, 2022
e5d8b4a
Move match builtin and assert + Add AssertionPayload
rctcwyvrn Jul 12, 2022
0466c25
Cleanup assertions
rctcwyvrn Jul 12, 2022
87078ad
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 12, 2022
f401e84
Fix tests
rctcwyvrn Jul 13, 2022
b09f45f
Update opcode description for assertBy
rctcwyvrn Jul 13, 2022
c581ea2
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 14, 2022
2a82231
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 15, 2022
fb1576a
Update branch to match main
rctcwyvrn Jul 15, 2022
3b9485e
Use the newly cleaned up _CharacterClassModel
rctcwyvrn Jul 16, 2022
64d1ed9
Add characterClass DSLTree node
rctcwyvrn Jul 16, 2022
2a6fe3c
Bugfixes
rctcwyvrn Jul 19, 2022
206bfc6
Add documentation for matchBuiltin
rctcwyvrn Jul 21, 2022
b53f524
Lots of cleanup
rctcwyvrn Jul 25, 2022
bb5245f
Move assertion payload
rctcwyvrn Jul 25, 2022
0746847
More minor cleanup
rctcwyvrn Jul 25, 2022
c718543
Perform boundary check for .anyScalar when in grapheme mode
rctcwyvrn Jul 25, 2022
d35b578
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 28, 2022
ca8acf2
Merge branch 'main' into speedy-builtins
rctcwyvrn Aug 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Cleanup
  • Loading branch information
rctcwyvrn committed Jul 12, 2022
commit 3e38ac6026b956396a8eac55b2dfd79892968cf7
14 changes: 4 additions & 10 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@ fileprivate extension Compiler.ByteCodeGen {
case let .unconverted(astAtom):
if optimizationsEnabled,
let cc = astAtom.ast.characterClass?.builtinCC {
emitBuiltinCharacterClass(cc)
builder.buildMatchBuiltin(
cc,
cc.isStrict(options: options),
isScalar: options.semanticLevel == .unicodeScalar)
return
}
if let consumer = try astAtom.ast.generateConsumer(options) {
Expand Down Expand Up @@ -114,15 +117,6 @@ fileprivate extension Compiler.ByteCodeGen {
throw Unsupported("Backreference kind: \(ref)")
}
}

mutating func emitBuiltinCharacterClass(
_ cc: BuiltinCC
) {
builder.buildMatchBuiltin(
cc,
cc.isStrict(options: options),
isScalar: options.semanticLevel == .unicodeScalar)
}

mutating func emitAssertion(
_ kind: AST.Atom.AssertionKind
Expand Down
1 change: 1 addition & 0 deletions Sources/_StringProcessing/Compiler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ func _compileRegex(
) throws -> Executor {
let ast = try parse(regex, syntax)
let dsl: DSLTree

switch semanticLevel?.base {
case .graphemeCluster:
let sequence = AST.MatchingOptionSequence(adding: [.init(.graphemeClusterSemantics, location: .fake)])
Expand Down
4 changes: 0 additions & 4 deletions Sources/_StringProcessing/Engine/Instruction.swift
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,6 @@ extension Instruction {
/// Operand: Ascii bitset register containing the bitset
case matchBitset

/// TODO: builtin assertions and anchors
case builtinAssertion

/// TODO: builtin character classes
case matchBuiltin

// MARK: Extension points
Expand Down
6 changes: 0 additions & 6 deletions Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -179,12 +179,6 @@ extension MEProgram.Builder {
.consumeBy, .init(consumer: makeConsumeFunction(p))))
}

// mutating func buildAssert(
// by p: @escaping MEProgram.AssertionFunction
// ) {
// instructions.append(.init(
// .assertBy, .init(assertion: makeAssertionFunction(p))))
// }
mutating func buildAssert(
by kind: AST.Atom.AssertionKind,
_ anchorsMatchNewlines: Bool,
Expand Down
9 changes: 0 additions & 9 deletions Sources/_StringProcessing/Engine/MEBuiltins.swift

This file was deleted.

8 changes: 0 additions & 8 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -262,8 +262,6 @@ extension Processor {
var matched: Bool
var next = input.index(after: currentPosition)
switch cc {
// lily note: when do these `any` cases appear? can they be compiled
// into consume instructions at compile time?
case .any, .anyGrapheme: matched = true
case .anyScalar:
matched = true
Expand Down Expand Up @@ -320,8 +318,6 @@ extension Processor {
matched = c.isNewline && (c.isASCII || !isStrictAscii)
case .newlineSequence:
matched = c.isNewline && (c.isASCII || !isStrictAscii)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was carried over from the old code, but why does this match a full \r\n sequence when in unicode scalars mode? Should we change that?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's what is specified in the Unicode proposal:

Additionally, \R and CharacterClass.newline provide a way to include the CR+LF pair, even when matching with Unicode scalar semantics.

I don't know the exact rationale behind it, but I would assume it's done in an effort to be compatible with other regex engines that match scalars by default but with \R still matching \r\n.

// lily note: what exactly is this doing? matching a full cr-lf character
// even though its in scalar mode? why?
if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" {
input.unicodeScalars.formIndex(after: &next)
}
Expand Down Expand Up @@ -426,7 +422,6 @@ extension Processor {
if usesSimpleUnicodeBoundaries {
// TODO: How should we handle bounds?
return atSimpleBoundary(usesASCIIWord, semanticLevel)
// lily note: there appear to be no test cases that use this option, ping alex to ask what they should look like
} else {
return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
}
Expand Down Expand Up @@ -716,9 +711,6 @@ extension Processor {
storedCaptures[capNum].registerValue(
value, overwriteInitial: sp)
controller.step()

case .builtinAssertion:
builtinAssertion()
}
}
}
Expand Down
37 changes: 3 additions & 34 deletions Sources/_StringProcessing/_CharacterClassModel.swift
Original file line number Diff line number Diff line change
Expand Up @@ -564,35 +564,6 @@ extension DSLTree.CustomCharacterClass {
}
}

extension _CharacterClassModel {
// FIXME: Calling on inverted sets wont be the same as the
// inverse of a boundary if at the start or end of the
// string. (Think through what we want: do it ourselves or
// give the caller both options).
func isBoundary(
_ input: String,
at pos: String.Index,
bounds: Range<String.Index>,
with options: MatchingOptions
) -> Bool {
// FIXME: How should we handle bounds?
// We probably need two concepts
if bounds.isEmpty { return false }
if pos == bounds.lowerBound {
return self.matches(in: input, at: pos, with: options) != nil
}
let priorIdx = input.index(before: pos)
if pos == bounds.upperBound {
return self.matches(in: input, at: priorIdx, with: options) != nil
}

let prior = self.matches(in: input, at: priorIdx, with: options) != nil
let current = self.matches(in: input, at: pos, with: options) != nil
return prior != current
}

}

internal enum BuiltinCC: UInt64 {
case any = 1
case anyGrapheme
Expand Down Expand Up @@ -622,11 +593,9 @@ extension BuiltinCC {

extension _CharacterClassModel {
internal var builtinCC: BuiltinCC? {
if isInverted { return nil } // lily todo: add another flag to the payload? when is this set? why are there so many weird edge cases in ccm? it feels like it's trying to model both builtins and custom models

// in that case, should we just convert a ccm to a ccc
// if it has these weird flags set?
// completely remove ccm from compilation and just emit either a builtincc or a ccc or an advance
// Future work: Make CCM always either a BuiltinCC or convertable to a
// custom character class
if isInverted { return nil }
switch self.cc {
case .any:
return .any
Expand Down