Skip to content

Commit

Permalink
Merge pull request #21 from cketti/charsequence_support
Browse files Browse the repository at this point in the history
Change `StringExtensions` to `CharSequenceExtensions`
  • Loading branch information
cketti authored Feb 25, 2023
2 parents 99ce70e + b925950 commit 8fa6142
Show file tree
Hide file tree
Showing 9 changed files with 216 additions and 240 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
@file:Suppress(
"INVISIBLE_MEMBER", // Required to be able to use kotlin.internal.HidesMembers
"INVISIBLE_REFERENCE", // Required to be able to use kotlin.internal.HidesMembers
)
package de.cketti.codepoints.deluxe

import de.cketti.codepoints.codePointAt as intCodePointAt
import de.cketti.codepoints.codePointBefore as intCodePointBefore

/**
 * Looks up the Unicode code point located at `index` and returns it as a [CodePoint].
 *
 * `index` is an ordinary `CharSequence` index, i.e. it counts `Char`s from the start of the character sequence.
 *
 * An [IndexOutOfBoundsException] is thrown when `index` lies outside the bounds of this character sequence.
 *
 * This is a thin wrapper that converts the `Int` result of [codePointAt][intCodePointAt] via `toCodePoint()`.
 */
@kotlin.internal.HidesMembers
fun CharSequence.codePointAt(index: Int): CodePoint = intCodePointAt(index).toCodePoint()

/**
 * Looks up the Unicode code point that ends directly before `index` and returns it as a [CodePoint].
 *
 * `index` is an ordinary `CharSequence` index, i.e. it counts `Char`s from the start of the character sequence.
 *
 * An [IndexOutOfBoundsException] is thrown when `index - 1` lies outside the bounds of this character sequence.
 *
 * This is a thin wrapper that converts the `Int` result of [codePointBefore][intCodePointBefore] via `toCodePoint()`.
 */
fun CharSequence.codePointBefore(index: Int): CodePoint = intCodePointBefore(index).toCodePoint()

/**
 * Wraps this character sequence in a [CodePointSequence], exposing its contents as a sequence of [CodePoint]s.
 */
fun CharSequence.codePointSequence(): CodePointSequence = CodePointSequence(this)

/**
 * Creates a [CodePointIterator] over the [CodePoint]s of this character sequence.
 *
 * @param startIndex `Char` index handed to the iterator as its start position; defaults to `0`.
 * @param endIndex `Char` index handed to the iterator as its end position; defaults to [length].
 */
fun CharSequence.codePointIterator(startIndex: Int = 0, endIndex: Int = length): CodePointIterator =
    CodePointIterator(this, startIndex, endIndex)
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,23 @@ package de.cketti.codepoints.deluxe
import kotlin.jvm.JvmInline

/**
* Sequence of [CodePoint]s in the given [String].
* Sequence of [CodePoint]s in the given [CharSequence].
*/
@JvmInline
value class CodePointSequence(private val text: String) : Sequence<CodePoint> {
value class CodePointSequence(private val text: CharSequence) : Sequence<CodePoint> {
override fun iterator(): CodePointIterator {
return text.codePointIterator()
}
}

/**
* Iterator for [CodePoint]s in the given [String].
* Iterator for [CodePoint]s in the given [CharSequence].
*
* The `startIndex` and `endIndex` parameters are the regular `String` indices, i.e. the number of `Char`s from the
* start of the string.
* The `startIndex` and `endIndex` parameters are the regular `CharSequence` indices, i.e. the number of `Char`s from
* the start of the character sequence.
*/
class CodePointIterator(
private val text: String,
private val text: CharSequence,
startIndex: Int,
private val endIndex: Int
) : Iterator<CodePoint> {
Expand Down
50 changes: 0 additions & 50 deletions kotlin-codepoints-deluxe/src/commonMain/kotlin/StringExtensions.kt

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ package de.cketti.codepoints.deluxe
import kotlin.test.assertEquals
import kotlin.test.Test

class StringExtensionsTest {
class CharSequenceExtensionsTest {
@Test
fun codePointAt() {
assertEquals('a'.toCodePoint(), "a".codePointAt(0))
Expand Down

This file was deleted.

155 changes: 155 additions & 0 deletions kotlin-codepoints/src/commonMain/kotlin/CharSequenceExtensions.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
package de.cketti.codepoints

/**
 * Returns the Unicode code point that starts at the given `index`.
 *
 * `index` is an ordinary `CharSequence` index: it counts `Char`s from the beginning of the character sequence.
 *
 * - When the `Char` at `index` is a high surrogate and it is directly followed by a low surrogate, the pair is decoded
 *   and the resulting supplementary code point is returned.
 * - In every other case (a BMP character, an unpaired surrogate, or a high surrogate that is the last `Char`), the
 *   value of the `Char` at `index` is returned unchanged, exactly as [CharSequence.get] would report it.
 *
 * Because a code point can span one or two `Char`s, stepping through a character sequence requires advancing the
 * index by the width of the code point just read. [CodePoints.charCount] provides that width:
 *
 * ```kotlin
 * // Two code points outside the BMP, each stored as a surrogate pair
 * val text = "\uD83E\uDD95\uD83E\uDD96"
 *
 * var index = 0
 * while (index < text.length) {
 *     val codePoint = text.codePointAt(index)
 *     // Do something with codePoint
 *
 *     index += CodePoints.charCount(codePoint)
 * }
 * ```
 *
 * @throws IndexOutOfBoundsException if `index` is negative or not less than the length of this character sequence.
 */
fun CharSequence.codePointAt(index: Int): Int {
    if (index < 0 || index >= length) throw IndexOutOfBoundsException()

    val lead = this[index]
    val trailIndex = index + 1
    // Only a high surrogate with a following Char can start a surrogate pair.
    if (!lead.isHighSurrogate() || trailIndex >= length) return lead.code

    val trail = this[trailIndex]
    return if (trail.isLowSurrogate()) CodePoints.toCodePoint(lead, trail) else lead.code
}

/**
 * Returns the Unicode code point that ends directly before the given `index`.
 *
 * `index` is an ordinary `CharSequence` index: it counts `Char`s from the beginning of the character sequence.
 *
 * - When the `Char` at `index - 1` is a low surrogate and the `Char` at `index - 2` is a high surrogate, the pair is
 *   decoded and the resulting supplementary code point is returned.
 * - In every other case the value of the `Char` at `index - 1` is returned unchanged, exactly as
 *   [CharSequence.get] would report it for `index - 1`.
 *
 * @throws IndexOutOfBoundsException if `index - 1` is outside the bounds of this character sequence.
 */
fun CharSequence.codePointBefore(index: Int): Int {
    val trailIndex = index - 1
    if (trailIndex < 0 || trailIndex >= length) throw IndexOutOfBoundsException()

    val trail = this[trailIndex]
    val leadIndex = trailIndex - 1
    // Only a low surrogate with a preceding Char can end a surrogate pair.
    if (!trail.isLowSurrogate() || leadIndex < 0) return trail.code

    val lead = this[leadIndex]
    return if (lead.isHighSurrogate()) CodePoints.toCodePoint(lead, trail) else trail.code
}

/**
 * Returns the number of Unicode code points in the specified text range of this `CharSequence`.
 *
 * The text range begins at the specified `beginIndex` and extends to the `Char` at index `endIndex - 1`. Thus, the
 * length (in `Char`s) of the text range is `endIndex - beginIndex`. Unpaired surrogates within the text range count as
 * one code point each. A high surrogate at `endIndex - 1` is never combined with a `Char` outside the range.
 *
 * An empty range (`beginIndex == endIndex`) contains no code points, so `0` is returned for it; this includes the
 * range that starts and ends at the length of this character sequence.
 *
 * @throws IndexOutOfBoundsException if `beginIndex` is negative, or `endIndex` is larger than the length of this
 * character sequence, or `beginIndex` is larger than `endIndex`.
 */
fun CharSequence.codePointCount(beginIndex: Int, endIndex: Int): Int {
    if (beginIndex < 0 || endIndex > length || beginIndex > endIndex) throw IndexOutOfBoundsException()

    var index = beginIndex
    var count = 0
    // The condition must be checked before the first read: a post-checked loop would count one code point for an
    // empty range, or read past the end when the empty range sits at `length`.
    while (index < endIndex) {
        val firstChar = this[index]
        index++
        // Consume the low surrogate as part of the same code point, but only if it is still inside the range.
        if (firstChar.isHighSurrogate() && index < endIndex && this[index].isLowSurrogate()) {
            index++
        }

        count++
    }

    return count
}

/**
 * Returns the index within this `CharSequence` that is offset from the given `index` by `codePointOffset` code points.
 *
 * A positive `codePointOffset` moves towards the end of the character sequence, a negative one towards its start, and
 * an offset of `0` returns `index` unchanged. Unpaired surrogates within the text range given by `index` and
 * `codePointOffset` count as one code point each.
 *
 * @throws IndexOutOfBoundsException if `index` is negative or larger than the length of this character sequence, or
 * if `codePointOffset` is positive and the subsequence starting with `index` has fewer than `codePointOffset` code
 * points, or if `codePointOffset` is negative and the subsequence before `index` has fewer than the absolute value of
 * `codePointOffset` code points (this includes `Int.MIN_VALUE`).
 */
fun CharSequence.offsetByCodePoints(index: Int, codePointOffset: Int): Int {
    if (index !in 0..length) throw IndexOutOfBoundsException()

    // Always points directly behind the last code point that has been stepped over (forward) or at the first `Char`
    // of the last code point that has been stepped over (backward).
    var currentIndex = index

    if (codePointOffset >= 0) {
        repeat(codePointOffset) {
            if (currentIndex >= length) throw IndexOutOfBoundsException()
            val firstChar = this[currentIndex]
            currentIndex++
            // A high surrogate followed by a low surrogate forms a single code point spanning two `Char`s.
            if (firstChar.isHighSurrogate() && currentIndex < length && this[currentIndex].isLowSurrogate()) {
                currentIndex++
            }
        }
    } else {
        // Count the negative offset up towards zero instead of negating it: `-Int.MIN_VALUE` overflows back to
        // `Int.MIN_VALUE`, which would make a `repeat(-codePointOffset)` loop silently run zero times.
        var remaining = codePointOffset
        while (remaining < 0) {
            if (currentIndex <= 0) throw IndexOutOfBoundsException()
            currentIndex--
            val lastChar = this[currentIndex]
            // A low surrogate preceded by a high surrogate forms a single code point spanning two `Char`s.
            if (lastChar.isLowSurrogate() && currentIndex > 0 && this[currentIndex - 1].isHighSurrogate()) {
                currentIndex--
            }
            remaining++
        }
    }

    return currentIndex
}
Loading

0 comments on commit 8fa6142

Please sign in to comment.