Skip to content

Emit custom character classes like an alternation #590

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
3b6b676
Copy over new ascii bitset
rctcwyvrn Jul 5, 2022
33caa79
Add matchBuiltin
rctcwyvrn Jul 5, 2022
139daa5
Remove debug prints
rctcwyvrn Jul 5, 2022
9abf4af
Remove bitset fast path
rctcwyvrn Jul 5, 2022
286f5d8
Fully remove remnants of the bitset fast path
rctcwyvrn Jul 6, 2022
9e915cd
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 7, 2022
e593ddb
Completely replace AssertionFunction with regexAssert(by:)
rctcwyvrn Jul 11, 2022
25dc277
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 12, 2022
3e38ac6
Cleanup
rctcwyvrn Jul 12, 2022
e5d8b4a
Move match builtin and assert + Add AssertionPayload
rctcwyvrn Jul 12, 2022
0466c25
Cleanup assertions
rctcwyvrn Jul 12, 2022
87078ad
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 12, 2022
f401e84
Fix tests
rctcwyvrn Jul 13, 2022
b09f45f
Update opcode description for assertBy
rctcwyvrn Jul 13, 2022
c581ea2
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 14, 2022
2a82231
Merge branch 'main' into speedy-builtins
rctcwyvrn Jul 15, 2022
fb1576a
Update branch to match main
rctcwyvrn Jul 15, 2022
3b9485e
Use the newly cleaned up _CharacterClassModel
rctcwyvrn Jul 16, 2022
64d1ed9
Add characterClass DSLTree node
rctcwyvrn Jul 16, 2022
2a6fe3c
Bugfixes
rctcwyvrn Jul 19, 2022
206bfc6
Add documentation for matchBuiltin
rctcwyvrn Jul 21, 2022
b53f524
Lots of cleanup
rctcwyvrn Jul 25, 2022
bb5245f
Move assertion payload
rctcwyvrn Jul 25, 2022
0746847
More minor cleanup
rctcwyvrn Jul 25, 2022
c718543
Perform boundary check for .anyScalar when in grapheme mode
rctcwyvrn Jul 25, 2022
3f0ece5
Emit custom character classes via saves and branches
rctcwyvrn Jul 25, 2022
79aabab
Add some comments
rctcwyvrn Jul 26, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Lots of cleanup
- static vars in payloads
- Clean up _CharacterClassModel
- Use the model for bytecodegen and consumer interface
- Merge the grapheme and scalar match builtin cases together
  • Loading branch information
rctcwyvrn committed Jul 25, 2022
commit b53f52481727baf8a92026828d3bb9fe9ca25835
5 changes: 1 addition & 4 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -166,10 +166,7 @@ fileprivate extension Compiler.ByteCodeGen {
}

mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) {
builder.buildMatchBuiltin(
cc.model,
cc.model.isStrictAscii(options: options),
isScalar: options.semanticLevel == .unicodeScalar)
builder.buildMatchBuiltin(model: cc.asRuntimeModel(options))
}

mutating func emitMatchScalar(_ s: UnicodeScalar) {
Expand Down
5 changes: 2 additions & 3 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -186,10 +186,9 @@ extension DSLTree.Atom {

extension DSLTree.Atom.CharacterClass {
func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction {
let model = asRuntimeModel(opts)
return { input, bounds in
// FIXME: should we worry about out of bounds?
model.withMatchLevel(opts.matchLevel)
.matches(in: input, at: bounds.lowerBound, with: opts)
model.matches(in: input, at: bounds.lowerBound)
}
}
}
Expand Down
42 changes: 23 additions & 19 deletions Sources/_StringProcessing/Engine/InstPayload.swift
Original file line number Diff line number Diff line change
Expand Up @@ -224,8 +224,8 @@ extension Instruction.Payload {
return (isScalar: pair.0 == 1, pair.1)
}

init(_ cc: _CharacterClassModel.Representation, _ isInverted: Bool, _ isStrict: Bool, _ isScalar: Bool) {
self.init(CharacterClassPayload(cc, isInverted, isStrict, isScalar).rawValue)
init(_ model: _CharacterClassModel) {
self.init(CharacterClassPayload(model).rawValue)
}
var characterClassPayload: CharacterClassPayload{
return CharacterClassPayload(rawValue: rawValue & _payloadMask)
Expand Down Expand Up @@ -350,33 +350,37 @@ struct CharacterClassPayload: RawRepresentable {
let rawValue: UInt64
// Layout:
// Top three bits are isInverted, isStrict, isScalar
// Lower 16 bits are _CCM.Representation
static let invertedShift: UInt64 = 55
static let strictShift: UInt64 = 54
static let scalarShift: UInt64 = 53
static let ccMask: UInt64 = 0xFF
// Lower 8 bits are _CCM.Representation
static var invertedBit: UInt64 { 1 << 55 }
static var strictASCIIBit: UInt64 { 1 << 54 }
static var scalarBit: UInt64 { 1 << 53 }
static var ccMask: UInt64 { 0xFF }
init(rawValue: UInt64) {
assert(rawValue & _opcodeMask == 0)
self.rawValue = rawValue
}
init(_ cc: _CharacterClassModel.Representation, _ isInverted: Bool, _ isStrict: Bool, _ isScalar: Bool) {
let invertedBit = isInverted ? 1 << CharacterClassPayload.invertedShift : 0
let strictBit = isStrict ? 1 << CharacterClassPayload.strictShift : 0
let scalarBit = isScalar ? 1 << CharacterClassPayload.scalarShift : 0
assert(cc.rawValue <= CharacterClassPayload.ccMask) //
self.init(rawValue: cc.rawValue + UInt64(invertedBit) + UInt64(strictBit) + UInt64(scalarBit))
init(_ model: _CharacterClassModel) {
let invertedBit = model.isInverted ? CharacterClassPayload.invertedBit : 0
let strictASCIIBit = model.isStrictAscii ? CharacterClassPayload.strictASCIIBit : 0
let scalarBit = model.matchLevel == .unicodeScalar ? CharacterClassPayload.scalarBit : 0
assert(model.cc.rawValue <= CharacterClassPayload.ccMask)
assert(model.cc.rawValue & invertedBit & strictASCIIBit & scalarBit == 0) // Sanity check
self.init(rawValue: model.cc.rawValue | invertedBit | strictASCIIBit | scalarBit)
}

var isInverted: Bool {
(self.rawValue >> CharacterClassPayload.invertedShift) & 1 == 1
self.rawValue & CharacterClassPayload.invertedBit != 0
}
var isStrict: Bool {
(self.rawValue >> CharacterClassPayload.strictShift) & 1 == 1
/// Whether the given character class should match only ASCII values, based on the options given
/// See Oniguruma options: (?D) (?\P) (?S) (?W)
var isStrictASCII: Bool {
self.rawValue & CharacterClassPayload.strictASCIIBit != 0
}
var isScalar: Bool {
(self.rawValue >> CharacterClassPayload.scalarShift) & 1 == 1
var isScalarSemantics: Bool {
self.rawValue & CharacterClassPayload.scalarBit != 0
}
var cc: _CharacterClassModel.Representation {
_CharacterClassModel.Representation.init(rawValue: self.rawValue & CharacterClassPayload.ccMask)!
_CharacterClassModel.Representation.init(
rawValue: self.rawValue & CharacterClassPayload.ccMask).unsafelyUnwrapped
}
}
8 changes: 2 additions & 6 deletions Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -171,13 +171,9 @@ extension MEProgram.Builder {
.matchBitset, .init(bitset: makeAsciiBitset(b), isScalar: true)))
}

mutating func buildMatchBuiltin(
_ cc: _CharacterClassModel,
_ isStrict: Bool,
isScalar: Bool
) {
mutating func buildMatchBuiltin(model: _CharacterClassModel) {
instructions.append(.init(
.matchBuiltin, .init(cc.cc, cc.isInverted, isStrict, isScalar)))
.matchBuiltin, .init(model)))
}

mutating func buildConsume(
Expand Down
127 changes: 61 additions & 66 deletions Sources/_StringProcessing/Engine/MEBuiltins.swift
Original file line number Diff line number Diff line change
Expand Up @@ -4,78 +4,66 @@ extension Processor {
mutating func matchBuiltin(
_ cc: _CharacterClassModel.Representation,
_ isInverted: Bool,
_ isStrictAscii: Bool
_ isStrictASCII: Bool,
_ isScalarSemantics: Bool
) -> Bool {
guard let c = load() else {
guard let char = load(), let scalar = loadScalar() else {
signalFailure()
return false
}

var asciiCheck: Bool {
(char.isASCII && !isScalarSemantics)
|| (scalar.isASCII && isScalarSemantics)
|| !isStrictASCII
}
var matched: Bool
var next = input.index(after: currentPosition)
switch cc {
case .any, .anyGrapheme: matched = true
case .anyScalar:
matched = true
var next: Input.Index
if isScalarSemantics {
next = input.unicodeScalars.index(after: currentPosition)
case .digit:
matched = c.isNumber && (c.isASCII || !isStrictAscii)
case .horizontalWhitespace:
matched = c.unicodeScalars.first?.isHorizontalWhitespace == true
&& (c.isASCII || !isStrictAscii)
case .newlineSequence, .verticalWhitespace:
matched = c.unicodeScalars.first?.isNewline == true
&& (c.isASCII || !isStrictAscii)
case .whitespace:
matched = c.isWhitespace && (c.isASCII || !isStrictAscii)
case .word:
matched = c.isWordCharacter && (c.isASCII || !isStrictAscii)
}
if isInverted {
matched.toggle()
}
if matched {
currentPosition = next
return true
} else {
signalFailure()
return false
}
}

mutating func matchBuiltinScalar(
_ cc: _CharacterClassModel.Representation,
_ isInverted: Bool,
_ isStrictAscii: Bool
) -> Bool {
guard let c = loadScalar() else {
signalFailure()
return false
next = input.index(after: currentPosition)
}

var matched: Bool
var next = input.unicodeScalars.index(after: currentPosition)
switch cc {
case .any: matched = true
case .anyScalar: matched = true
case .any:
matched = true
case .anyGrapheme:
matched = true
next = input.index(after: currentPosition)
case .anyScalar:
// FIXME: This allows us to be not-scalar aligned when in grapheme mode
// Should this even be allowed?
matched = true
next = input.unicodeScalars.index(after: currentPosition)
case .digit:
matched = c.properties.numericType != nil && (c.isASCII || !isStrictAscii)
if isScalarSemantics {
matched = scalar.properties.numericType != nil
} else {
matched = char.isNumber && asciiCheck
}
case .horizontalWhitespace:
matched = c.isHorizontalWhitespace && (c.isASCII || !isStrictAscii)
matched = scalar.isHorizontalWhitespace && asciiCheck
case .verticalWhitespace:
matched = c.isNewline && (c.isASCII || !isStrictAscii)
matched = scalar.isNewline && asciiCheck
case .newlineSequence:
matched = c.isNewline && (c.isASCII || !isStrictAscii)
if c == "\r" && next != input.endIndex && input.unicodeScalars[next] == "\n" {
matched = scalar.isNewline && asciiCheck
if isScalarSemantics && matched && scalar == "\r"
&& next != input.endIndex && input.unicodeScalars[next] == "\n" {
// Match a full CR-LF sequence even in scalar semantics
input.unicodeScalars.formIndex(after: &next)
}
case .whitespace:
matched = c.properties.isWhitespace && (c.isASCII || !isStrictAscii)
if isScalarSemantics {
matched = scalar.properties.isWhitespace && asciiCheck
} else {
matched = char.isWhitespace && asciiCheck
}
case .word:
matched = (c.properties.isAlphabetic || c == "_") && (c.isASCII || !isStrictAscii)
if isScalarSemantics {
matched = scalar.properties.isAlphabetic && asciiCheck
} else {
matched = char.isWordCharacter && asciiCheck
}
}
if isInverted {
matched.toggle()
Expand Down Expand Up @@ -176,43 +164,50 @@ extension Processor {
}

struct AssertionPayload: RawRepresentable {
var _assertionKindMask: UInt64 { ~0xFFF0_0000_0000_0000 }
var _opcodeMask: UInt64 { 0xFF00_0000_0000_0000 }

let rawValue: UInt64

init(rawValue: UInt64) {
self.rawValue = rawValue
assert(rawValue & _opcodeMask == 0)
}

static var anchorBit: UInt64 { 1 << 55 }
static var boundaryBit: UInt64 { 1 << 54 }
static var strictASCIIWordBit: UInt64 { 1 << 53 }
static var isScalarBit: UInt64 { 1 << 52 }
static var assertionKindMask: UInt64 { 0xFF }

init(_ assertion: DSLTree.Atom.Assertion,
_ anchorsMatchNewlines: Bool,
_ usesSimpleUnicodeBoundaries: Bool,
_ usesASCIIWord: Bool,
_ semanticLevel: MatchingOptions.SemanticLevel
) {
// 4 bits of options
let anchorBit: UInt64 = anchorsMatchNewlines ? (1 << 55) : 0
let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? (1 << 54) : 0
let strictBit: UInt64 = usesASCIIWord ? (1 << 53) : 0
let semanticLevelBit: UInt64 = semanticLevel == .unicodeScalar ? (1 << 52) : 0
let optionsBits: UInt64 = anchorBit + boundaryBit + strictBit + semanticLevelBit
let anchorBit: UInt64 = anchorsMatchNewlines ? AssertionPayload.anchorBit : 0
let boundaryBit: UInt64 = usesSimpleUnicodeBoundaries ? AssertionPayload.boundaryBit : 0
let strictASCIIWordBit: UInt64 = usesASCIIWord ? AssertionPayload.strictASCIIWordBit : 0
let isScalarBit: UInt64 = semanticLevel == .unicodeScalar ? AssertionPayload.isScalarBit : 0

// 4 bits for the assertion kind
// 8 bits for the assertion kind
// Future work: Optimize this layout
let kind = assertion.rawValue
self.init(rawValue: kind + optionsBits)
assert(kind <= AssertionPayload.assertionKindMask)
assert(kind & anchorBit & boundaryBit & strictASCIIWordBit & isScalarBit == 0)
self.init(rawValue: kind | anchorBit | boundaryBit | strictASCIIWordBit | isScalarBit)
}

var kind: DSLTree.Atom.Assertion {
return .init(rawValue: self.rawValue & _assertionKindMask)!
return .init(
rawValue: self.rawValue & AssertionPayload.assertionKindMask).unsafelyUnwrapped
}
var anchorsMatchNewlines: Bool { self.rawValue & AssertionPayload.anchorBit != 0 }
var usesSimpleUnicodeBoundaries: Bool {
self.rawValue & AssertionPayload.boundaryBit != 0
}
var anchorsMatchNewlines: Bool { (self.rawValue >> 55) & 1 == 1 }
var usesSimpleUnicodeBoundaries: Bool { (self.rawValue >> 54) & 1 == 1 }
var usesASCIIWord: Bool { (self.rawValue >> 53) & 1 == 1 }
var usesASCIIWord: Bool { self.rawValue & AssertionPayload.strictASCIIWordBit != 0 }
var semanticLevel: MatchingOptions.SemanticLevel {
if (self.rawValue >> 52) & 1 == 1 {
if self.rawValue & AssertionPayload.isScalarBit != 0 {
return .unicodeScalar
} else {
return .graphemeCluster
Expand Down
15 changes: 7 additions & 8 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -479,14 +479,13 @@ extension Processor {

case .matchBuiltin:
let payload = payload.characterClassPayload
if payload.isScalar {
if matchBuiltinScalar(payload.cc, payload.isInverted, payload.isStrict) {
controller.step()
}
} else {
if matchBuiltin(payload.cc, payload.isInverted, payload.isStrict) {
controller.step()
}
if matchBuiltin(
payload.cc,
payload.isInverted,
payload.isStrictASCII,
payload.isScalarSemantics
) {
controller.step()
}

case .consumeBy:
Expand Down
12 changes: 0 additions & 12 deletions Sources/_StringProcessing/MatchingOptions.swift
Original file line number Diff line number Diff line change
Expand Up @@ -122,18 +122,6 @@ extension MatchingOptions {
}
}

// Deprecated CharacterClass.MatchLevel API
extension MatchingOptions {
var matchLevel: _CharacterClassModel.MatchLevel {
switch semanticLevel {
case .graphemeCluster:
return .graphemeCluster
case .unicodeScalar:
return .unicodeScalar
}
}
}

// MARK: - Implementation
extension MatchingOptions {
/// An option that changes the behavior of a regular expression.
Expand Down
Loading