Handle boundaries when matching in substrings (#675)

* Handle boundaries when matching in substrings Some of our existing matching routines use the start/endIndex of the input, which is basically never the right thing to do. This change revises those checks to use the search bounds, by either moving the boundary check out of the matching method, or if the boundary is a part of what needs to be matched (e.g. word boundaries have different behavior at the start/end than in the middle of a string) the search bounds are passed into the matching method. Testing is currently handled by piggy-backing on the existing match tests; we should add more tests to handle substring- specific edge cases. * Handle sub-character substring boundaries This change passes the end boundary down into matching methods, and uses it to find the actual character that is part of the input substring, even if the substring's end boundary is in the middle of a grapheme cluster. Substrings cannot have sub-Unicode scalar boundaries as of Swift 5.7; we can remove a check for this when matching an individual scalar.
swiftlang · milseman · Dec 15, 2023 · Oct 6, 2022 · Dec 2, 2022 · Dec 5, 2022
commit f5b0b5ee7654be5e7383b3e3bcc1daa5557751a9
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
@@ -68,14 +68,16 @@ extension Character {
     switch opts.semanticLevel {
     case .graphemeCluster:
       return { input, bounds in
-        let low = bounds.lowerBound
+        guard let (char, next) = input.characterAndEnd(
+          at: bounds.lowerBound, limitedBy: bounds.upperBound)
+        else { return nil }
         if isCaseInsensitive && isCased {
-          return input[low].lowercased() == lowercased()
-            ? input.index(after: low)
+          return char.lowercased() == self.lowercased()
+            ? next
             : nil
         } else {
-          return input[low] == self
-            ? input.index(after: low)
+          return char == self
+            ? next
             : nil
         }
       }
@@ -188,7 +190,7 @@ extension DSLTree.Atom.CharacterClass {
   func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction {
     let model = asRuntimeModel(opts)
     return { input, bounds in
-      model.matches(in: input, at: bounds.lowerBound)
+      model.matches(in: input, at: bounds.lowerBound, limitedBy: bounds.upperBound)
     }
   }
 }
@@ -517,7 +519,7 @@ extension DSLTree.CustomCharacterClass {
       }
       if isInverted {
         return opts.semanticLevel == .graphemeCluster
-          ? input.index(after: bounds.lowerBound)
+          ? Swift.min(input.index(after: bounds.lowerBound), bounds.upperBound)
           : input.unicodeScalars.index(after: bounds.lowerBound)
       }
       return nil

diff --git a/Sources/_StringProcessing/Engine/MEBuiltins.swift b/Sources/_StringProcessing/Engine/MEBuiltins.swift
@@ -15,9 +15,10 @@ extension Processor {
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> Bool {
-    guard let next = input.matchBuiltinCC(
+    guard currentPosition < end, let next = input.matchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics
@@ -102,56 +103,96 @@ extension Processor {
 
     case .wordBoundary:
       if payload.usesSimpleUnicodeBoundaries {
-        // TODO: How should we handle bounds?
         return atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
       } else {
-        return input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
+        return input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
       }
 
     case .notWordBoundary:
       if payload.usesSimpleUnicodeBoundaries {
-        // TODO: How should we handle bounds?
         return !atSimpleBoundary(payload.usesASCIIWord, payload.semanticLevel)
       } else {
-        return !input.isOnWordBoundary(at: currentPosition, using: &wordIndexCache, &wordIndexMaxIndex)
+        return !input.isOnWordBoundary(at: currentPosition, in: searchBounds, using: &wordIndexCache, &wordIndexMaxIndex)
       }
     }
   }
 }
 
 // MARK: Matching `.`
 extension String {
-  // TODO: Should the below have a `limitedBy` parameter?
+  /// Returns the character at `pos`, bounded by `end`, as well as the upper
+  /// boundary of the returned character.
+  ///
+  /// This function handles loading a character from a string while respecting
+  /// an end boundary, even if that end boundary is sub-character or sub-scalar.
+  ///
+  ///   - If `pos` is at or past `end`, this function returns `nil`.
+  ///   - If `end` is between `pos` and the next grapheme cluster boundary (i.e.,
+  ///     `end` is before `self.index(after: pos)`, then the returned character
+  ///     is smaller than the one that would be produced by `self[pos]` and the
+  ///     returned index is at the end of that character.
+  ///   - If `end` is between `pos` and the next grapheme cluster boundary, and
+  ///     is not on a Unicode scalar boundary, the partial scalar is dropped. This
+  ///     can result in a `nil` return or a character that includes only part of
+  ///     the `self[pos]` character.
+  ///
+  /// - Parameters:
+  ///   - pos: The position to load a character from.
+  ///   - end: The limit for the character at `pos`.
+  /// - Returns: The character at `pos`, bounded by `end`, if it exists, along
+  ///   with the upper bound of that character. The upper bound is always
+  ///   scalar-aligned.
+  func characterAndEnd(at pos: String.Index, limitedBy end: String.Index) -> (Character, String.Index)? {
+    // FIXME: Sink into the stdlib to avoid multiple boundary calculations
+    guard pos < end else { return nil }
+    let next = index(after: pos)
+    if next <= end {
+      return (self[pos], next)
+    }
 
+    // `end` must be a sub-character position that is between `pos` and the
+    // next grapheme boundary. This is okay if `end` is on a Unicode scalar
+    // boundary, but if it's in the middle of a scalar's code units, there
+    // may not be a character to return at all after rounding down. Use
+    // `Substring`'s rounding to determine what we can return.
+    let substr = self[pos..<end]
+    return substr.isEmpty
+      ? nil
+      : (substr.first!, substr.endIndex)
+  }
+
   func matchAnyNonNewline(
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    guard currentPosition < endIndex else {
-      return nil
-    }
+    guard currentPosition < end else { return nil }
     if case .definite(let result) = _quickMatchAnyNonNewline(
       at: currentPosition,
+      limitedBy: end,
       isScalarSemantics: isScalarSemantics
     ) {
       assert(result == _thoroughMatchAnyNonNewline(
         at: currentPosition,
+        limitedBy: end,
         isScalarSemantics: isScalarSemantics))
       return result
     }
     return _thoroughMatchAnyNonNewline(
       at: currentPosition,
+      limitedBy: end,
       isScalarSemantics: isScalarSemantics)
   }
 
   @inline(__always)
   private func _quickMatchAnyNonNewline(
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> QuickResult<String.Index?> {
-    assert(currentPosition < endIndex)
+    assert(currentPosition < end)
     guard let (asciiValue, next, isCRLF) = _quickASCIICharacter(
-      at: currentPosition
+      at: currentPosition, limitedBy: end
     ) else {
       return .unknown
     }
@@ -167,46 +208,47 @@ extension String {
   @inline(never)
   private func _thoroughMatchAnyNonNewline(
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    assert(currentPosition < endIndex)
     if isScalarSemantics {
+      guard currentPosition < end else { return nil }
       let scalar = unicodeScalars[currentPosition]
       guard !scalar.isNewline else { return nil }
       return unicodeScalars.index(after: currentPosition)
     }
 
-    let char = self[currentPosition]
-    guard !char.isNewline else { return nil }
-    return index(after: currentPosition)
+    guard let (char, next) = characterAndEnd(at: currentPosition, limitedBy: end),
+          !char.isNewline
+    else { return nil }
+    return next
   }
 }
 
 // MARK: - Built-in character class matching
 extension String {
-  // TODO: Should the below have a `limitedBy` parameter?
-
   // Mentioned in ProgrammersManual.md, update docs if redesigned
   func matchBuiltinCC(
     _ cc: _CharacterClassModel.Representation,
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isInverted: Bool,
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    guard currentPosition < endIndex else {
-      return nil
-    }
+    guard currentPosition < end else { return nil }
     if case .definite(let result) = _quickMatchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics
     ) {
       assert(result == _thoroughMatchBuiltinCC(
         cc,
         at: currentPosition,
+        limitedBy: end,
         isInverted: isInverted,
         isStrictASCII: isStrictASCII,
         isScalarSemantics: isScalarSemantics))
@@ -215,6 +257,7 @@ extension String {
     return _thoroughMatchBuiltinCC(
       cc,
       at: currentPosition,
+      limitedBy: end,
       isInverted: isInverted,
       isStrictASCII: isStrictASCII,
       isScalarSemantics: isScalarSemantics)
@@ -225,13 +268,17 @@ extension String {
   private func _quickMatchBuiltinCC(
     _ cc: _CharacterClassModel.Representation,
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isInverted: Bool,
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> QuickResult<String.Index?> {
-    assert(currentPosition < endIndex)
+    assert(currentPosition < end)
     guard let (next, result) = _quickMatch(
-      cc, at: currentPosition, isScalarSemantics: isScalarSemantics
+      cc,
+      at: currentPosition,
+      limitedBy: end,
+      isScalarSemantics: isScalarSemantics
     ) else {
       return .unknown
     }
@@ -243,27 +290,25 @@ extension String {
   private func _thoroughMatchBuiltinCC(
     _ cc: _CharacterClassModel.Representation,
     at currentPosition: String.Index,
+    limitedBy end: String.Index,
     isInverted: Bool,
     isStrictASCII: Bool,
     isScalarSemantics: Bool
   ) -> String.Index? {
-    assert(currentPosition < endIndex)
-    let char = self[currentPosition]
+    // TODO: Branch here on scalar semantics
+    // Don't want to pay character cost if unnecessary
+    guard var (char, next) =
+            characterAndEnd(at: currentPosition, limitedBy: end)
+    else { return nil }
     let scalar = unicodeScalars[currentPosition]
 
     let asciiCheck = !isStrictASCII
     || (scalar.isASCII && isScalarSemantics)
     || char.isASCII
 
     var matched: Bool
-    var next: String.Index
-    switch (isScalarSemantics, cc) {
-    case (_, .anyGrapheme):
-      next = index(after: currentPosition)
-    case (true, _):
+    if isScalarSemantics && cc != .anyGrapheme {
       next = unicodeScalars.index(after: currentPosition)
-    case (false, _):
-      next = index(after: currentPosition)
     }
 
     switch cc {
@@ -291,7 +336,7 @@ extension String {
       if isScalarSemantics {
         matched = scalar.isNewline && asciiCheck
         if matched && scalar == "\r"
-            && next != endIndex && unicodeScalars[next] == "\n" {
+            && next < end && unicodeScalars[next] == "\n" {
           // Match a full CR-LF sequence even in scalar semantics
           unicodeScalars.formIndex(after: &next)
         }

diff --git a/Sources/_StringProcessing/Engine/MEQuantify.swift b/Sources/_StringProcessing/Engine/MEQuantify.swift
@@ -17,18 +17,18 @@ extension Processor {
         boundaryCheck: !isScalarSemantics,
         isCaseInsensitive: false)
     case .builtin:
-      // FIXME: bounds check? endIndex or end?
+      guard currentPosition < end else { return nil }
 
       // We only emit .quantify if it consumes a single character
       return input.matchBuiltinCC(
         payload.builtin,
         at: currentPosition,
+        limitedBy: end,
         isInverted: payload.builtinIsInverted,
         isStrictASCII: payload.builtinIsStrict,
         isScalarSemantics: isScalarSemantics)
     case .any:
-      // FIXME: endIndex or end?
-      guard currentPosition < input.endIndex else { return nil }
+      guard currentPosition < end else { return nil }
 
       if payload.anyMatchesNewline {
         if isScalarSemantics {
@@ -38,7 +38,9 @@ extension Processor {
       }
 
       return input.matchAnyNonNewline(
-        at: currentPosition, isScalarSemantics: isScalarSemantics)
+        at: currentPosition,
+        limitedBy: end,
+        isScalarSemantics: isScalarSemantics)
     }
   }