Skip to content

Make functions to read and write code points public #308

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
May 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions core/api/kotlinx-io-core.api
Original file line number Diff line number Diff line change
Expand Up @@ -187,12 +187,14 @@ public final class kotlinx/io/SourcesKt {
}

public final class kotlinx/io/Utf8Kt {
public static final fun readCodePointValue (Lkotlinx/io/Source;)I
public static final fun readLine (Lkotlinx/io/Source;)Ljava/lang/String;
public static final fun readLineStrict (Lkotlinx/io/Source;J)Ljava/lang/String;
public static synthetic fun readLineStrict$default (Lkotlinx/io/Source;JILjava/lang/Object;)Ljava/lang/String;
public static final fun readString (Lkotlinx/io/Buffer;)Ljava/lang/String;
public static final fun readString (Lkotlinx/io/Source;)Ljava/lang/String;
public static final fun readString (Lkotlinx/io/Source;J)Ljava/lang/String;
public static final fun writeCodePointValue (Lkotlinx/io/Sink;I)V
public static final fun writeString (Lkotlinx/io/Sink;Ljava/lang/String;II)V
public static synthetic fun writeString$default (Lkotlinx/io/Sink;Ljava/lang/String;IIILjava/lang/Object;)V
}
Expand Down
2 changes: 2 additions & 0 deletions core/api/kotlinx-io-core.klib.api
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ final fun (kotlinx.io/Buffer).kotlinx.io/snapshot(): kotlinx.io.bytestring/ByteS
final fun (kotlinx.io/RawSink).kotlinx.io/buffered(): kotlinx.io/Sink // kotlinx.io/buffered|buffered@kotlinx.io.RawSink(){}[0]
final fun (kotlinx.io/RawSource).kotlinx.io/buffered(): kotlinx.io/Source // kotlinx.io/buffered|buffered@kotlinx.io.RawSource(){}[0]
final fun (kotlinx.io/Sink).kotlinx.io/write(kotlinx.io.bytestring/ByteString, kotlin/Int =..., kotlin/Int =...) // kotlinx.io/write|write@kotlinx.io.Sink(kotlinx.io.bytestring.ByteString;kotlin.Int;kotlin.Int){}[0]
final fun (kotlinx.io/Sink).kotlinx.io/writeCodePointValue(kotlin/Int) // kotlinx.io/writeCodePointValue|writeCodePointValue@kotlinx.io.Sink(kotlin.Int){}[0]
final fun (kotlinx.io/Sink).kotlinx.io/writeDecimalLong(kotlin/Long) // kotlinx.io/writeDecimalLong|writeDecimalLong@kotlinx.io.Sink(kotlin.Long){}[0]
final fun (kotlinx.io/Sink).kotlinx.io/writeDouble(kotlin/Double) // kotlinx.io/writeDouble|writeDouble@kotlinx.io.Sink(kotlin.Double){}[0]
final fun (kotlinx.io/Sink).kotlinx.io/writeDoubleLe(kotlin/Double) // kotlinx.io/writeDoubleLe|writeDoubleLe@kotlinx.io.Sink(kotlin.Double){}[0]
Expand All @@ -105,6 +106,7 @@ final fun (kotlinx.io/Source).kotlinx.io/readByteArray(): kotlin/ByteArray // ko
final fun (kotlinx.io/Source).kotlinx.io/readByteArray(kotlin/Int): kotlin/ByteArray // kotlinx.io/readByteArray|readByteArray@kotlinx.io.Source(kotlin.Int){}[0]
final fun (kotlinx.io/Source).kotlinx.io/readByteString(): kotlinx.io.bytestring/ByteString // kotlinx.io/readByteString|readByteString@kotlinx.io.Source(){}[0]
final fun (kotlinx.io/Source).kotlinx.io/readByteString(kotlin/Int): kotlinx.io.bytestring/ByteString // kotlinx.io/readByteString|readByteString@kotlinx.io.Source(kotlin.Int){}[0]
final fun (kotlinx.io/Source).kotlinx.io/readCodePointValue(): kotlin/Int // kotlinx.io/readCodePointValue|readCodePointValue@kotlinx.io.Source(){}[0]
final fun (kotlinx.io/Source).kotlinx.io/readDecimalLong(): kotlin/Long // kotlinx.io/readDecimalLong|readDecimalLong@kotlinx.io.Source(){}[0]
final fun (kotlinx.io/Source).kotlinx.io/readDouble(): kotlin/Double // kotlinx.io/readDouble|readDouble@kotlinx.io.Source(){}[0]
final fun (kotlinx.io/Source).kotlinx.io/readDoubleLe(): kotlin/Double // kotlinx.io/readDoubleLe|readDoubleLe@kotlinx.io.Source(){}[0]
Expand Down
41 changes: 25 additions & 16 deletions core/common/src/Utf8.kt
Original file line number Diff line number Diff line change
Expand Up @@ -122,14 +122,23 @@ internal fun String.utf8Size(startIndex: Int = 0, endIndex: Int = length): Long
/**
* Encodes [codePoint] in UTF-8 and writes it to this sink.
*
* Note that, in general, a value retrieved from [Char.code] cannot be written directly,
* as it may be a part of a [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2) (which can be
* detected using [Char.isSurrogate], or [Char.isHighSurrogate] and [Char.isLowSurrogate]).
* Such a pair of characters needs to be manually converted back to a single code point,
* which can then be written to a [Sink].
* Without such a conversion, data written to a [Sink] cannot be converted back
* to the string from which the surrogate pair was retrieved.
*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The documentation does not say anything about what will be written if we pass a code point with a surrogate value (U+d800..U+dfff).
E.g., on the JVM, a single byte with the value 63 (`?`) will be written.

Also, we should add a test for it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently, we always write '?'. Utf8Test::writeSurrogateCodePoint covers that scenario.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this is not mentioned in the KDoc

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for pointing to that! Opened #314

* @param codePoint the codePoint to be written.
*
* @throws IllegalStateException when the sink is closed.
*
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.utf8CodePointSample
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeUtf8CodePointSample
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeSurrogatePair
*/
@OptIn(DelicateIoApi::class)
internal fun Sink.writeUtf8CodePoint(codePoint: Int): Unit =
public fun Sink.writeCodePointValue(codePoint: Int): Unit =
writeToInternalBuffer { it.commonWriteUtf8CodePoint(codePoint) }
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it handle negative values of codePoint?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not. Opened #317


/**
Expand Down Expand Up @@ -196,24 +205,31 @@ public fun Source.readString(byteCount: Long): String {
}

/**
* Removes and returns a single UTF-8 code point, reading between 1 and 4 bytes as necessary.
* Decodes a single code point value from UTF-8 code units, reading between 1 and 4 bytes as necessary.
*
* If this source is exhausted before a complete code point can be read, this throws an
* [EOFException] and consumes no input.
*
* If this source doesn't start with a properly-encoded UTF-8 code point, this method will remove
* 1 or more non-UTF-8 bytes and return the replacement character (`U+fffd`). This covers encoding
* problems (the input is not properly-encoded UTF-8), characters out of range (beyond the
* `0x10ffff` limit of Unicode), code points for UTF-16 surrogates (`U+d800`..`U+dfff`) and overlong
* encodings (such as `0xc080` for the NUL character in modified UTF-8).
* If this source starts with an ill-formed UTF-8 code unit sequence, this method will remove
* 1 or more non-UTF-8 bytes and return the replacement character (`U+fffd`).
*
* The replacement character (`U+fffd`) will also be returned if the source starts with a well-formed
* code unit sequence, but the decoded value does not pass further validation, for example when
* the value is out of range (beyond the `0x10ffff` limit of Unicode), maps to a UTF-16 surrogate (`U+d800`..`U+dfff`),
* or an overlong encoding is detected (such as `0xc080` for the NUL character in modified UTF-8).
*
* Note that, in general, the returned value may not be directly convertible to [Char], as it may be outside
* of [Char]'s value range; in that case, it should be manually converted to a
* [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2).
*
* @throws EOFException when the source is exhausted before a complete code point can be read.
* @throws IllegalStateException when the source is closed.
*
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readUtf8CodePointSample
* @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.surrogatePairs
*/
@OptIn(InternalIoApi::class)
internal fun Source.readUtf8CodePoint(): Int {
public fun Source.readCodePointValue(): Int {
require(1)

val b0 = buffer[0].toInt()
Expand All @@ -226,13 +242,6 @@ internal fun Source.readUtf8CodePoint(): Int {
return buffer.commonReadUtf8CodePoint()
}

/**
* @see Source.readUtf8CodePoint
*/
internal fun Buffer.readUtf8CodePoint(): Int {
return this.commonReadUtf8CodePoint()
}

/**
* Removes and returns UTF-8 encoded characters up to but not including the next line break. A line break is
* either `"\n"` or `"\r\n"`; these characters are not included in the result.
Expand Down
14 changes: 7 additions & 7 deletions core/common/test/AbstractSourceTest.kt
Original file line number Diff line number Diff line change
Expand Up @@ -1099,25 +1099,25 @@ abstract class AbstractBufferedSourceTest internal constructor(
with(sink) {
writeByte(0x7f)
emit()
assertEquals(0x7f, source.readUtf8CodePoint().toLong())
assertEquals(0x7f, source.readCodePointValue().toLong())

writeByte(0xdf.toByte())
writeByte(0xbf.toByte())
emit()
assertEquals(0x07ff, source.readUtf8CodePoint().toLong())
assertEquals(0x07ff, source.readCodePointValue().toLong())

writeByte(0xef.toByte())
writeByte(0xbf.toByte())
writeByte(0xbf.toByte())
emit()
assertEquals(0xffff, source.readUtf8CodePoint().toLong())
assertEquals(0xffff, source.readCodePointValue().toLong())

writeByte(0xf4.toByte())
writeByte(0x8f.toByte())
writeByte(0xbf.toByte())
writeByte(0xbf.toByte())
emit()
assertEquals(0x10ffff, source.readUtf8CodePoint().toLong())
assertEquals(0x10ffff, source.readCodePointValue().toLong())
}
}

Expand All @@ -1126,20 +1126,20 @@ abstract class AbstractBufferedSourceTest internal constructor(
with(sink) {
writeByte(0xdf.toByte()) // a second byte is missing
emit()
assertFailsWith<EOFException> { source.readUtf8CodePoint() }
assertFailsWith<EOFException> { source.readCodePointValue() }
assertEquals(1, source.readByteArray().size)

writeByte(0xe2.toByte())
writeByte(0x98.toByte()) // a third byte is missing
emit()
assertFailsWith<EOFException> { source.readUtf8CodePoint() }
assertFailsWith<EOFException> { source.readCodePointValue() }
assertEquals(2, source.readByteArray().size)

writeByte(0xf0.toByte())
writeByte(0x9f.toByte())
writeByte(0x92.toByte()) // a fourth byte is missing
emit()
assertFailsWith<EOFException> { source.readUtf8CodePoint() }
assertFailsWith<EOFException> { source.readCodePointValue() }
assertEquals(3, source.readByteArray().size)
}
}
Expand Down
46 changes: 23 additions & 23 deletions core/common/test/Utf8Test.kt
Original file line number Diff line number Diff line change
Expand Up @@ -285,22 +285,22 @@ class Utf8Test {
@Test
fun readCodePointFromEmptyBufferThrowsEofException() {
val buffer = Buffer()
assertFailsWith<EOFException> { buffer.readUtf8CodePoint() }
assertFailsWith<EOFException> { buffer.readCodePointValue() }
}

@Test
fun readLeadingContinuationByteReturnsReplacementCharacter() {
val buffer = Buffer()
buffer.writeByte(0xbf.toByte())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertTrue(buffer.exhausted())
}

@Test
fun readMissingContinuationBytesThrowsEofException() {
val buffer = Buffer()
buffer.writeByte(0xdf.toByte())
assertFailsWith<EOFException> { buffer.readUtf8CodePoint() }
assertFailsWith<EOFException> { buffer.readCodePointValue() }
assertFalse(buffer.exhausted()) // Prefix byte wasn't consumed.
}

Expand All @@ -309,21 +309,21 @@ class Utf8Test {
// 5-byte and 6-byte code points are not supported.
val buffer = Buffer()
buffer.write("f888808080".decodeHex())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertTrue(buffer.exhausted())

buffer.write(ByteArray(Segment.SIZE - 2))
buffer.write("f888808080".decodeHex())
buffer.skip(Segment.SIZE - 2L)
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertTrue(buffer.exhausted())
}

Expand All @@ -332,8 +332,8 @@ class Utf8Test {
// Use a non-continuation byte where a continuation byte is expected.
val buffer = Buffer()
buffer.write("df20".decodeHex())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(0x20, buffer.readUtf8CodePoint()) // Non-continuation character not consumed.
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertEquals(0x20, buffer.readCodePointValue()) // Non-continuation character not consumed.
assertTrue(buffer.exhausted())
}

Expand All @@ -342,18 +342,18 @@ class Utf8Test {
// A 4-byte encoding with data above the U+10ffff Unicode maximum.
val buffer = Buffer()
buffer.write("f4908080".decodeHex())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertTrue(buffer.exhausted())
}

@Test
fun readSurrogateCodePoint() {
val buffer = Buffer()
buffer.write("eda080".decodeHex())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertTrue(buffer.exhausted())
buffer.write("edbfbf".decodeHex())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertTrue(buffer.exhausted())
}

Expand All @@ -362,15 +362,15 @@ class Utf8Test {
// Use 2 bytes to encode data that only needs 1 byte.
val buffer = Buffer()
buffer.write("c080".decodeHex())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
assertTrue(buffer.exhausted())
}

@Test
fun writeCodePointBeyondUnicodeMaximum() {
val buffer = Buffer()
assertFailsWith<IllegalArgumentException>("Unexpected code point: 0x110000") {
buffer.writeUtf8CodePoint(0x110000)
buffer.writeCodePointValue(0x110000)
}
}

Expand Down Expand Up @@ -428,7 +428,7 @@ class Utf8Test {

private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int, prefixLength: Int = 0) {
write(ByteArray(prefixLength))
writeUtf8CodePoint(codePoint)
writeCodePointValue(codePoint)
skip(prefixLength.toLong())
assertArrayEquals(expectedHex.decodeHex(), readByteArray())
}
Expand All @@ -437,7 +437,7 @@ class Utf8Test {
write(ByteArray(prefixLength))
write(hex.decodeHex())
skip(prefixLength.toLong())
assertEquals(expectedCodePoint, readUtf8CodePoint())
assertEquals(expectedCodePoint, readCodePointValue())
}

private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String, prefixLength: Int = 0) {
Expand Down Expand Up @@ -469,7 +469,7 @@ class Utf8Test {
val bufferUtf8 = Buffer()
for (charIdx in string.indices) {
val c = string[charIdx]
bufferUtf8.writeUtf8CodePoint(c.code)
bufferUtf8.writeCodePointValue(c.code)
}
assertArrayEquals(expectedUtf8, bufferUtf8.readByteArray())

Expand Down
73 changes: 69 additions & 4 deletions core/common/test/samples/samples.kt
Original file line number Diff line number Diff line change
Expand Up @@ -100,19 +100,84 @@ class KotlinxIoCoreCommonSamples {
fun writeUtf8CodePointSample() {
val buffer = Buffer()

buffer.writeInt('Δ'.code) // writes integer value as is
assertContentEquals(byteArrayOf(0, 0, 0x3, 0x94.toByte()), buffer.readByteArray())
// Basic Latin (a.k.a. ASCII) characters are encoded with a single byte
buffer.writeCodePointValue('Y'.code)
assertContentEquals(byteArrayOf(0x59), buffer.readByteArray())

buffer.writeUtf8CodePoint('Δ'.code) // encodes code point using UTF-8 encoding
// wider characters are encoded into multiple UTF-8 code units
buffer.writeCodePointValue('Δ'.code)
assertContentEquals(byteArrayOf(0xce.toByte(), 0x94.toByte()), buffer.readByteArray())

// note the difference: writeInt won't encode the code point, like writeCodePointValue did
buffer.writeInt('Δ'.code)
assertContentEquals(byteArrayOf(0, 0, 0x3, 0x94.toByte()), buffer.readByteArray())
}

@Test
fun writeSurrogatePair() {
val buffer = Buffer()

// U+1F31E (a.k.a. "sun with face") is too wide to fit in a single UTF-16 character,
// so it's represented using a surrogate pair.
val chars = "🌞".toCharArray()
assertEquals(2, chars.size)

// such a pair has to be manually converted to a single code point
assertTrue(chars[0].isHighSurrogate())
assertTrue(chars[1].isLowSurrogate())

val highSurrogate = chars[0].code
val lowSurrogate = chars[1].code

// see https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
val codePoint = 0x10000 + (highSurrogate - 0xD800).shl(10).or(lowSurrogate - 0xDC00)
assertEquals(0x1F31E, codePoint)

// now we can write the code point
buffer.writeCodePointValue(codePoint)
// and read the correct string back
assertEquals("🌞", buffer.readString())

// we won't achieve that by writing the surrogates as they are
buffer.apply {
writeCodePointValue(highSurrogate)
writeCodePointValue(lowSurrogate)
}
assertNotEquals("🌞", buffer.readString())
}

@Test
fun readUtf8CodePointSample() {
val buffer = Buffer()

buffer.writeUShort(0xce94U)
assertEquals(0x394, buffer.readUtf8CodePoint()) // decodes single UTF-8 encoded code point
assertEquals(0x394, buffer.readCodePointValue()) // decodes a single UTF-8 encoded code point
}

@Test
fun surrogatePairs() {
val buffer = Buffer()

// that's a U+1F31A, a.k.a. "new moon with face"
buffer.writeString("🌚")
// it should be encoded with 4 code units
assertEquals(4, buffer.size)

// let's read it back as a single code point
val moonCodePoint = buffer.readCodePointValue()
// all code units were consumed
assertEquals(0, buffer.size)

// the moon is too wide to fit in a single UTF-16 character!
assertNotEquals(moonCodePoint, moonCodePoint.toChar().code)
// "too wide" means in the [U+010000, U+10FFFF] range
assertTrue(moonCodePoint in 0x10000..0x10FFFF)

// See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
val highSurrogate = (0xD800 + (moonCodePoint - 0x10000).ushr(10)).toChar()
val lowSurrogate = (0xDC00 + (moonCodePoint - 0x10000).and(0x3FF)).toChar()

assertContentEquals(charArrayOf(highSurrogate, lowSurrogate), "🌚".toCharArray())
}

@Test
Expand Down