Skip to content

Optimize matching to match on scalar values when possible #525

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 31 commits into from
Jul 12, 2022
Merged
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
e518bdf
Add unicode and dna benchmarks
rctcwyvrn Jun 23, 2022
eb64036
Fixup naming, turn off firstmatch, add benchmark filtering by regex
rctcwyvrn Jun 23, 2022
1237f9a
Make --exclude-ns actually work correctly
rctcwyvrn Jun 23, 2022
fdf2c23
Add usage comment for generateFasta
rctcwyvrn Jun 23, 2022
eeb38e9
First ver
rctcwyvrn Jun 28, 2022
6f76f36
Merge branch 'main' into match-scalar
rctcwyvrn Jun 29, 2022
3a2b324
Remove matchseq entirely
rctcwyvrn Jun 29, 2022
df8919e
Merge branch 'just-one-more-benchmark-suite' into temp
rctcwyvrn Jun 29, 2022
c2ee8cc
Finish up matchScalar
rctcwyvrn Jun 29, 2022
809b085
Factor out nextScalarIndex for matchBitsetScalar
rctcwyvrn Jun 29, 2022
06c77c7
Add scalar mode support for matching bitsets + fix bug
rctcwyvrn Jun 29, 2022
e3d7ad7
Emit matchScalar in quotedLiteral when in unicode scalar mode
rctcwyvrn Jun 29, 2022
3e1e088
Add tests
rctcwyvrn Jun 29, 2022
b4c7c8c
Cleanup
rctcwyvrn Jun 30, 2022
5359e31
Revert "Merge branch 'just-one-more-benchmark-suite' into temp"
rctcwyvrn Jun 30, 2022
6e4c2bd
Cleanup
rctcwyvrn Jul 4, 2022
1a359b4
Add case-insensitive match instructions
rctcwyvrn Jul 4, 2022
097ffeb
Remove extra instructions and use payload bits instead
rctcwyvrn Jul 4, 2022
76667a2
Comment out compiletests for now
rctcwyvrn Jul 4, 2022
6a1f6e9
Fix compile tests
rctcwyvrn Jul 5, 2022
fce8f9a
Merge branch 'main' into scalar-optimizations-clean
rctcwyvrn Jul 7, 2022
0860368
Fix scalar matching in grapheme semantic mode
hamishknight Jul 8, 2022
c6bc811
Preserve scalar syntax in DSL conversion
hamishknight Jul 8, 2022
bca1d2b
Change scalar semantics to match #565
rctcwyvrn Jul 11, 2022
79e60ac
Merge branch 'not-to-scale' into scalar-optimizations-clean
rctcwyvrn Jul 11, 2022
22cc9d5
Add edge case test
rctcwyvrn Jul 11, 2022
7a9923d
Always match .scalar under grapheme semantics
rctcwyvrn Jul 11, 2022
47f8e66
Merge branch 'main' into scalar-optimizations-clean
rctcwyvrn Jul 11, 2022
7aa98d1
Add new instructions to compile tests
rctcwyvrn Jul 11, 2022
113cfe3
Add an XFAIL test for scalar coalescing
rctcwyvrn Jul 12, 2022
9949b8e
Fix XCTExpectFailure for linux
rctcwyvrn Jul 12, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Fix compile tests
  • Loading branch information
rctcwyvrn committed Jul 5, 2022
commit 6a1f6e9131e0092bedd6866d7cd85840108886a8
312 changes: 216 additions & 96 deletions Tests/RegexTests/CompileTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,125 @@

import XCTest

/// A flattened instruction vocabulary for the compile tests.
///
/// Besides one case per opcode, this enum has separate cases for the
/// payload-dependent variants of `match`, `matchScalar` and `matchBitset`
/// (see `DecodedInstr.decode(_:)`), so tests can assert on exactly which
/// variant the compiler emitted.
enum DecodedInstr {
  // Control flow, registers and save points.
  case invalid, moveImmediate, branch, condBranchZeroElseDecrement
  case save, saveAddress, splitSaving
  case clear, clearThrough
  case accept, fail

  // Input consumption.
  case advance

  // Element matching and its case-insensitive variant.
  case match, matchCaseInsensitive

  // Scalar matching, split by case sensitivity and boundary checking.
  case matchScalar, matchScalarCaseInsensitiveUnchecked
  case matchScalarCaseInsensitive, matchScalarUnchecked

  // Bitset matching, scalar and non-scalar flavours.
  case matchBitsetScalar, matchBitset

  // Closure-driven matching.
  case consumeBy, assertBy, matchBy

  // Captures and backreferences.
  case backreference
  case beginCapture, endCapture, transformCapture, captureValue

  // Built-in assertions and character classes.
  case builtinAssertion, builtinCharacterClass
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Future: this will just be replaced with the encoding/decoding infrastructure


extension DecodedInstr {
  /// Decode the given instruction by looking at the opcode and payload,
  /// expanding out certain instructions like `matchScalar` and `match` into
  /// their variants.
  ///
  /// Most opcodes map one-to-one onto the case of the same name. Three are
  /// refined by flags read from the payload:
  ///
  /// - `match`: split on the case-insensitivity flag of `elementPayload`.
  /// - `matchScalar`: split on the case-insensitivity and boundary-check
  ///   flags of `scalarPayload`.
  /// - `matchBitset`: split on the scalar flag of `bitsetPayload`.
  ///
  /// Must stay in sync with Processor.cycle
  ///
  /// - Parameter instruction: The instruction to classify.
  /// - Returns: The decoded variant. `.invalid` is never returned: an
  ///   `.invalid` opcode traps instead.
  static func decode(_ instruction: Instruction) -> DecodedInstr {
    let (opcode, payload) = instruction.destructure

    // Deliberately no `default:` so that a newly added opcode becomes a
    // compile error here instead of being silently misclassified.
    switch opcode {
    case .invalid:
      fatalError("Invalid program")
    case .moveImmediate:
      return .moveImmediate
    case .branch:
      return .branch
    case .condBranchZeroElseDecrement:
      return .condBranchZeroElseDecrement
    case .save:
      return .save
    case .saveAddress:
      return .saveAddress
    case .splitSaving:
      return .splitSaving
    case .clear:
      return .clear
    case .clearThrough:
      return .clearThrough
    case .accept:
      return .accept
    case .fail:
      return .fail
    case .advance:
      return .advance
    case .match:
      let (isCaseInsensitive, _) = payload.elementPayload
      return isCaseInsensitive ? .matchCaseInsensitive : .match
    case .matchScalar:
      let (_, caseInsensitive, boundaryCheck) = payload.scalarPayload
      // The "Unchecked" variants are the ones without a boundary check.
      switch (caseInsensitive, boundaryCheck) {
      case (true, true):
        return .matchScalarCaseInsensitive
      case (true, false):
        return .matchScalarCaseInsensitiveUnchecked
      case (false, true):
        return .matchScalar
      case (false, false):
        return .matchScalarUnchecked
      }
    case .matchBitset:
      let (isScalar, _) = payload.bitsetPayload
      return isScalar ? .matchBitsetScalar : .matchBitset
    case .consumeBy:
      // Written with the leading dot like every other return in this switch.
      return .consumeBy
    case .assertBy:
      return .assertBy
    case .matchBy:
      return .matchBy
    case .backreference:
      return .backreference
    case .beginCapture:
      return .beginCapture
    case .endCapture:
      return .endCapture
    case .transformCapture:
      return .transformCapture
    case .captureValue:
      return .captureValue
    case .builtinAssertion:
      return .builtinAssertion
    case .builtinCharacterClass:
      return .builtinCharacterClass
    }
  }
}

extension RegexTests {

private func testCompilationEquivalence(
Expand Down Expand Up @@ -147,20 +266,21 @@ extension RegexTests {
for regex: String,
syntax: SyntaxOptions = .traditional,
semanticLevel: RegexSemanticLevel? = nil,
contains targets: Set<Instruction.OpCode> = [],
doesNotContain invalid: Set<Instruction.OpCode> = [],
contains targets: Set<DecodedInstr> = [],
doesNotContain invalid: Set<DecodedInstr> = [],
file: StaticString = #file,
line: UInt = #line
) {
do {
let prog = try _compileRegex(regex, syntax, semanticLevel)
var found: Set<Instruction.OpCode> = []
var found: Set<DecodedInstr> = []
for inst in prog.engine.instructions {
found.insert(inst.opcode)
let decoded = DecodedInstr.decode(inst)
found.insert(decoded)

if invalid.contains(inst.opcode) {
if invalid.contains(decoded) {
XCTFail(
"Compiled regex '\(regex)' contains incorrect opcode \(inst.opcode)",
"Compiled regex '\(regex)' contains incorrect opcode \(decoded)",
file: file,
line: line)
return
Expand All @@ -181,94 +301,94 @@ extension RegexTests {
}
}

// func testBitsetCompile() {
// expectProgram(
// for: "[abc]",
// contains: [.matchBitset],
// doesNotContain: [.consumeBy, .matchBitsetScalar])
// expectProgram(
// for: "[abc]",
// semanticLevel: .unicodeScalar,
// contains: [.matchBitsetScalar],
// doesNotContain: [.matchBitset, .consumeBy])
// }
//
// func testScalarOptimizeCompilation() {
// // all ascii quoted literal -> elide boundary checks
// expectProgram(
// for: "abcd",
// contains: [.matchScalar, .matchScalarUnchecked],
// doesNotContain: [.match, .matchSequence, .consumeBy])
// // ascii character -> matchScalar with boundary check
// expectProgram(
// for: "a",
// contains: [.matchScalar],
// doesNotContain: [.match, .matchSequence, .consumeBy, .matchScalarUnchecked])
// // quoted literal is not all ascii -> match scalar when possible, always do boundary checks
// expectProgram(
// for: "aaa\u{301}",
// contains: [.match, .matchScalar],
// doesNotContain: [.consumeBy, .matchScalarUnchecked])
// // scalar mode -> always emit match scalar without boundary checks
// expectProgram(
// for: "abcd",
// semanticLevel: .unicodeScalar,
// contains: [.matchScalarUnchecked],
// doesNotContain: [.match, .matchSequence, .consumeBy, .matchScalar])
// expectProgram(
// for: "a",
// semanticLevel: .unicodeScalar,
// contains: [.matchScalarUnchecked],
// doesNotContain: [.match, .matchSequence, .consumeBy, .matchScalar])
// expectProgram(
// for: "aaa\u{301}",
// semanticLevel: .unicodeScalar,
// contains: [.matchScalarUnchecked],
// doesNotContain: [.match, .matchSequence, .consumeBy, .matchScalar])
// }
//
// func testCaseInsensitivityCompilation() {
// // quoted literal is all ascii -> match scalar case insensitive and skip
// // boundary checks
// expectProgram(
// for: "(?i)abcd",
// contains: [.matchScalarCaseInsensitiveUnchecked, .matchScalarCaseInsensitive],
// doesNotContain: [.match, .matchCaseInsensitive, .matchScalar, .matchScalarUnchecked])
// // quoted literal is all non-cased ascii -> emit match scalar instructions
// expectProgram(
// for: "(?i)&&&&",
// contains: [.matchScalar, .matchScalarUnchecked],
// doesNotContain: [.match, .matchCaseInsensitive,
// .matchScalarCaseInsensitive, .matchScalarCaseInsensitiveUnchecked])
// // quoted literal is not all ascii -> match scalar case insensitive when
// // possible, match character case insensitive when needed, always perform
// // boundary check
// expectProgram(
// for: "(?i)abcd\u{301}",
// contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive],
// doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match, .matchScalar])
// // same as before but contains ascii non cased characters -> emit matchScalar for them
// expectProgram(
// for: "(?i)abcd\u{301};.'!",
// contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar],
// doesNotContain: [.matchScalarCaseInsensitiveUnchecked, .match])
// // contains non-ascii non-cased characters -> emit match
// expectProgram(
// for: "(?i)abcd\u{301};.'!💖",
// contains: [.matchCaseInsensitive, .matchScalarCaseInsensitive, .matchScalar, .match],
// doesNotContain: [.matchScalarCaseInsensitiveUnchecked])
//
// // scalar mode -> emit unchecked scalar match only, emit case insensitive
// // only if the scalar is cased
// expectProgram(
// for: "(?i);.'!💖",
// semanticLevel: .unicodeScalar,
// contains: [.matchScalarUnchecked],
// doesNotContain: [.matchScalarCaseInsensitiveUnchecked])
// expectProgram(
// for: "(?i)abcdé",
// semanticLevel: .unicodeScalar,
// contains: [.matchScalarCaseInsensitiveUnchecked],
// doesNotContain: [.matchScalarUnchecked])
// }
/// Checks which bitset instruction a small custom character class compiles
/// to, with the default semantic level and with unicode-scalar semantics.
func testBitsetCompile() {
  let pattern = "[abc]"

  // Default semantic level: expect the non-scalar bitset instruction only.
  expectProgram(
    for: pattern,
    contains: [.matchBitset],
    doesNotContain: [.matchBitsetScalar, .consumeBy])

  // Unicode-scalar semantics: expect the scalar bitset instruction only.
  expectProgram(
    for: pattern,
    semanticLevel: .unicodeScalar,
    contains: [.matchBitsetScalar],
    doesNotContain: [.consumeBy, .matchBitset])
}

/// Checks which match instructions are emitted for plain literals, first with
/// the default semantic level and then with unicode-scalar semantics.
func testScalarOptimizeCompilation() {
  let asciiLiteral = "abcd"
  let singleASCII = "a"
  let literalWithCombiningMark = "aaa\u{301}"

  // --- Default semantic level ---

  // An all-ASCII quoted literal: boundary checks are elided where possible,
  // so both the checked and unchecked scalar matches are expected.
  expectProgram(
    for: asciiLiteral,
    contains: [.matchScalarUnchecked, .matchScalar],
    doesNotContain: [.consumeBy, .match])

  // A lone ASCII character: scalar match with a boundary check.
  expectProgram(
    for: singleASCII,
    contains: [.matchScalar],
    doesNotContain: [.matchScalarUnchecked, .consumeBy, .match])

  // A literal that is not all ASCII: scalar matches where possible, and the
  // boundary check is never elided.
  expectProgram(
    for: literalWithCombiningMark,
    contains: [.matchScalar, .match],
    doesNotContain: [.matchScalarUnchecked, .consumeBy])

  // --- Unicode-scalar semantics ---
  // Every literal is expected to compile to unchecked scalar matches only.

  expectProgram(
    for: asciiLiteral,
    semanticLevel: .unicodeScalar,
    contains: [.matchScalarUnchecked],
    doesNotContain: [.matchScalar, .consumeBy, .match])

  expectProgram(
    for: singleASCII,
    semanticLevel: .unicodeScalar,
    contains: [.matchScalarUnchecked],
    doesNotContain: [.matchScalar, .consumeBy, .match])

  expectProgram(
    for: literalWithCombiningMark,
    semanticLevel: .unicodeScalar,
    contains: [.matchScalarUnchecked],
    doesNotContain: [.matchScalar, .consumeBy, .match])
}

/// Checks which match instructions are emitted for literals compiled under
/// the `(?i)` case-insensitive option, with the default semantic level and
/// with unicode-scalar semantics.
func testCaseInsensitivityCompilation() {
  // --- Default semantic level ---

  // All-ASCII quoted literal: case-insensitive scalar matches, with boundary
  // checks skipped where possible.
  expectProgram(
    for: "(?i)abcd",
    contains: [
      .matchScalarCaseInsensitive,
      .matchScalarCaseInsensitiveUnchecked,
    ],
    doesNotContain: [
      .matchScalarUnchecked, .matchScalar, .matchCaseInsensitive, .match,
    ])

  // All-ASCII literal with no cased characters: plain scalar matches, no
  // case-insensitive instructions.
  expectProgram(
    for: "(?i)&&&&",
    contains: [.matchScalarUnchecked, .matchScalar],
    doesNotContain: [
      .matchScalarCaseInsensitiveUnchecked, .matchScalarCaseInsensitive,
      .matchCaseInsensitive, .match,
    ])

  // Literal that is not all ASCII: case-insensitive scalar match when
  // possible, case-insensitive character match when needed, and the
  // boundary check is always performed.
  expectProgram(
    for: "(?i)abcd\u{301}",
    contains: [.matchScalarCaseInsensitive, .matchCaseInsensitive],
    doesNotContain: [
      .matchScalar, .match, .matchScalarCaseInsensitiveUnchecked,
    ])

  // As above, plus ASCII characters with no case: those are expected to
  // compile to matchScalar.
  expectProgram(
    for: "(?i)abcd\u{301};.'!",
    contains: [
      .matchScalar, .matchScalarCaseInsensitive, .matchCaseInsensitive,
    ],
    doesNotContain: [.match, .matchScalarCaseInsensitiveUnchecked])

  // As above, plus a non-ASCII character with no case: that one is expected
  // to compile to match.
  expectProgram(
    for: "(?i)abcd\u{301};.'!💖",
    contains: [
      .match, .matchScalar, .matchScalarCaseInsensitive,
      .matchCaseInsensitive,
    ],
    doesNotContain: [.matchScalarCaseInsensitiveUnchecked])

  // --- Unicode-scalar semantics ---
  // Only unchecked scalar matches are expected; the case-insensitive variant
  // is emitted only if the scalar is cased.

  expectProgram(
    for: "(?i);.'!💖",
    semanticLevel: .unicodeScalar,
    contains: [.matchScalarUnchecked],
    doesNotContain: [.matchScalarCaseInsensitiveUnchecked])

  expectProgram(
    for: "(?i)abcdé",
    semanticLevel: .unicodeScalar,
    contains: [.matchScalarCaseInsensitiveUnchecked],
    doesNotContain: [.matchScalarUnchecked])
}
}