Skip to content

[stdlib] Export grapheme breaking facility #62794

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 29 additions & 38 deletions stdlib/private/StdlibUnicodeUnittest/GraphemeBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,56 +15,47 @@
#if _runtime(_ObjC)
import Foundation

func parseGraphemeBreakTests(
_ data: String,
into result: inout [(String, Int)]
) {
for line in data.split(separator: "\n") {
public struct GraphemeBreakTest {
public let string: String
public let pieces: [[Unicode.Scalar]]

init?(line: some StringProtocol) {
// Only look at actual tests
guard line.hasPrefix("÷") else {
continue
}
guard line.hasPrefix("÷") else { return nil }

let info = line.split(separator: "#")
let components = info[0].split(separator: " ")

var string = ""
var count = 0

for i in components.indices {
guard i != 0 else {
continue
}

let scalar: Unicode.Scalar

// If we're an odd index, this is a scalar.
if i & 0x1 == 1 {
scalar = Unicode.Scalar(UInt32(components[i], radix: 16)!)!

var pieces: [[Unicode.Scalar]] = []

var piece: [Unicode.Scalar] = []
for component in components {
switch component {
case "":
break
case "×": // no grapheme break opportunity
break
case "÷": // grapheme break opportunity
guard !piece.isEmpty else { break }
pieces.append(piece)
piece = []
case _: // hexadecimal scalar value
guard let value = UInt32(component, radix: 16) else { return nil }
guard let scalar = Unicode.Scalar(value) else { return nil }
string.unicodeScalars.append(scalar)
} else {
// Otherwise, it is a grapheme breaking operator.

// If this is a break, record the +1 count. Otherwise it is × which is
// not a break.
if components[i] == "÷" {
count += 1
}
piece.append(scalar)
}
}

result.append((string, count))
if !piece.isEmpty { pieces.append(piece) }
self.string = string
self.pieces = pieces
}
}

public let graphemeBreakTests: [(String, Int)] = {
var result: [(String, Int)] = []

public let graphemeBreakTests: [GraphemeBreakTest] = {
let testFile = readInputFile("GraphemeBreakTest.txt")

parseGraphemeBreakTests(testFile, into: &result)

return result
return testFile.split(separator: "\n")
.compactMap { GraphemeBreakTest(line: $0) }
}()
#endif
147 changes: 129 additions & 18 deletions stdlib/public/core/StringGraphemeBreaking.swift
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
Expand Down Expand Up @@ -436,6 +436,117 @@ internal struct _GraphemeBreakingState {
var shouldBreakRI = false
}

extension Unicode {
/// A state machine for recognizing character (i.e., extended grapheme
/// cluster) boundaries in an arbitrary series of Unicode scalars.
///
/// To detect grapheme breaks in a sequence of Unicode scalars, feed each of
/// them to the `hasBreak(before:)` method. The method returns true if the
/// sequence has a grapheme break preceding the given value.
///
/// The results produced by this state machine are guaranteed to match the way
/// `String` splits its contents into `Character` values.
@available(SwiftStdlib 5.8, *)
public // SPI(Foundation) FIXME: We need API for this
struct _CharacterRecognizer {
internal var _previous: Unicode.Scalar
internal var _state: _GraphemeBreakingState

/// Returns a non-nil value if it can be determined whether there is a
/// grapheme break between `scalar1` and `scalar2` without knowing anything
/// about the scalars that precede `scalar1`. This can optionally be used as
/// a fast (but incomplete) test before spinning up a full state machine
/// session.
@_effects(releasenone)
public static func quickBreak(
between scalar1: Unicode.Scalar,
and scalar2: Unicode.Scalar
) -> Bool? {
if scalar1.value == 0xD, scalar2.value == 0xA {
return false
}
if _hasGraphemeBreakBetween(scalar1, scalar2) {
return true
}
return nil
}

/// Initialize a new character recognizer at the _start of text_ (sot)
/// position.
///
/// The resulting state machine will report a grapheme break on the
/// first scalar that is fed to it.
public init() {
_state = _GraphemeBreakingState()
// To avoid having to handle the empty case specially, we use NUL as the
// placeholder before the first scalar. NUL is a control character, so per
// rule GB5, it will induce an unconditional grapheme break before the
// first actual scalar, emulating GB1.
_previous = Unicode.Scalar(0 as UInt8)
}

/// Feeds the next scalar to the state machine, returning a Boolean value
/// indicating whether it starts a new extended grapheme cluster.
///
/// This method will always report a break the first time it is called
/// on a newly initialized recognizer.
///
/// The state machine does not carry information across character
/// boundaries. I.e., if this method returns true, then `self` after the
/// call is equivalent to feeding the same scalar to a newly initialized
/// recognizer instance.
@_effects(releasenone)
public mutating func hasBreak(
before next: Unicode.Scalar
) -> Bool {
let r = _state.shouldBreak(between: _previous, and: next)
if r {
_state = _GraphemeBreakingState()
}
_previous = next
return r
}

/// Decode the scalars in the given UTF-8 buffer and feed them to the
/// recognizer up to and including the scalar following the first grapheme
/// break. If the buffer contains a grapheme break, then this function
/// returns the index range of the scalar that follows the first one;
/// otherwise it returns `nil`.
///
/// On return, the state of the recognizer is updated to reflect the scalars
/// up to and including the returned one. You can detect additional grapheme
/// breaks by feeding the recognizer subsequent data.
///
/// - Parameter buffer: A buffer containing valid UTF-8 data, starting and
/// ending on Unicode scalar boundaries.
///
/// - Parameter start: A valid index into `buffer`, addressing the first
/// code unit of a UTF-8 scalar in the buffer, or the end.
///
/// - Returns: The index range of the scalar that follows the first grapheme
/// break in the buffer, if there is one. If the buffer contains no
/// grapheme breaks, then this function returns `nil`.
///
/// - Warning: This function does not validate that the buffer contains
/// valid UTF-8 data; its behavior is undefined if given invalid input.
@_effects(releasenone)
public mutating func _firstBreak(
inUncheckedUnsafeUTF8Buffer buffer: UnsafeBufferPointer<UInt8>,
startingAt start: Int = 0
) -> Range<Int>? {
var i = start
while i < buffer.endIndex {
let (next, n) = _decodeScalar(buffer, startingAt: i)
if hasBreak(before: next) {
return Range(_uncheckedBounds: (i, i &+ n))
}
i &+= n
}
return nil
}
}
}

extension _StringGuts {
// Returns the stride of the grapheme cluster starting at offset `index`,
// assuming it is on a grapheme cluster boundary.
Expand All @@ -459,7 +570,7 @@ extension _StringGuts {

while true {
guard let (scalar2, nextIndex) = nextScalar(index) else { break }
if shouldBreak(between: scalar, and: scalar2, at: index, with: &state) {
if state.shouldBreak(between: scalar, and: scalar2) {
break
}
index = nextIndex
Expand Down Expand Up @@ -505,7 +616,7 @@ extension _StringGuts {
}
}

extension _StringGuts {
extension _GraphemeBreakingState {
// Return true if there is an extended grapheme cluster boundary between two
// scalars, based on state information previously collected about preceding
// scalars.
Expand All @@ -517,11 +628,9 @@ extension _StringGuts {
//
// This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
internal func shouldBreak(
internal mutating func shouldBreak(
between scalar1: Unicode.Scalar,
and scalar2: Unicode.Scalar,
at index: Int,
with state: inout _GraphemeBreakingState
and scalar2: Unicode.Scalar
) -> Bool {
// GB3
if scalar1.value == 0xD, scalar2.value == 0xA {
Expand All @@ -545,8 +654,8 @@ extension _StringGuts {
var enterIndicSequence = false

defer {
state.isInEmojiSequence = enterEmojiSequence
state.isInIndicSequence = enterIndicSequence
self.isInEmojiSequence = enterEmojiSequence
self.isInIndicSequence = enterIndicSequence
}

switch (x, y) {
Expand Down Expand Up @@ -591,14 +700,14 @@ extension _StringGuts {
// continue the grapheme cluster by combining more scalars later. If we're
// not currently in an emoji sequence, but our lhs scalar is a pictograph,
// then that's a signal that it's the start of an emoji sequence.
if state.isInEmojiSequence || x == .extendedPictographic {
if self.isInEmojiSequence || x == .extendedPictographic {
enterEmojiSequence = true
}

// If we're currently in an indic sequence (or if our lhs is a linking
// consonant), then this check and everything underneath ensures that
// we continue being in one and may check if this extend is a Virama.
if state.isInIndicSequence || scalar1._isLinkingConsonant {
if self.isInIndicSequence || scalar1._isLinkingConsonant {
if y == .extend {
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)

Expand All @@ -611,7 +720,7 @@ extension _StringGuts {
enterIndicSequence = true

if scalar2._isVirama {
state.hasSeenVirama = true
self.hasSeenVirama = true
}
}

Expand All @@ -627,32 +736,34 @@ extension _StringGuts {

// GB11
case (.zwj, .extendedPictographic):
return !state.isInEmojiSequence
return !self.isInEmojiSequence

// GB12 & GB13
case (.regionalIndicator, .regionalIndicator):
defer {
state.shouldBreakRI.toggle()
self.shouldBreakRI.toggle()
}

return state.shouldBreakRI
return self.shouldBreakRI

// GB999
default:
// GB9c
if
state.isInIndicSequence,
state.hasSeenVirama,
self.isInIndicSequence,
self.hasSeenVirama,
scalar2._isLinkingConsonant
{
state.hasSeenVirama = false
self.hasSeenVirama = false
return false
}

return true
}
}
}

extension _StringGuts {
// Return true if there is an extended grapheme cluster boundary between two
// scalars, with no previous knowledge about preceding scalars.
//
Expand Down
Loading