Kotlin · fzhinkin · May 7, 2024 · May 3, 2024 · May 6, 2024 · May 7, 2024
diff --git a/core/api/kotlinx-io-core.api b/core/api/kotlinx-io-core.api
@@ -187,12 +187,14 @@ public final class kotlinx/io/SourcesKt {
 }
 
 public final class kotlinx/io/Utf8Kt {
+	public static final fun readCodePointValue (Lkotlinx/io/Source;)I
 	public static final fun readLine (Lkotlinx/io/Source;)Ljava/lang/String;
 	public static final fun readLineStrict (Lkotlinx/io/Source;J)Ljava/lang/String;
 	public static synthetic fun readLineStrict$default (Lkotlinx/io/Source;JILjava/lang/Object;)Ljava/lang/String;
 	public static final fun readString (Lkotlinx/io/Buffer;)Ljava/lang/String;
 	public static final fun readString (Lkotlinx/io/Source;)Ljava/lang/String;
 	public static final fun readString (Lkotlinx/io/Source;J)Ljava/lang/String;
+	public static final fun writeCodePointValue (Lkotlinx/io/Sink;I)V
 	public static final fun writeString (Lkotlinx/io/Sink;Ljava/lang/String;II)V
 	public static synthetic fun writeString$default (Lkotlinx/io/Sink;Ljava/lang/String;IIILjava/lang/Object;)V
 }

diff --git a/core/api/kotlinx-io-core.klib.api b/core/api/kotlinx-io-core.klib.api
@@ -82,6 +82,7 @@ final fun (kotlinx.io/Buffer).kotlinx.io/snapshot(): kotlinx.io.bytestring/ByteS
 final fun (kotlinx.io/RawSink).kotlinx.io/buffered(): kotlinx.io/Sink // kotlinx.io/buffered|buffered@kotlinx.io.RawSink(){}[0]
 final fun (kotlinx.io/RawSource).kotlinx.io/buffered(): kotlinx.io/Source // kotlinx.io/buffered|buffered@kotlinx.io.RawSource(){}[0]
 final fun (kotlinx.io/Sink).kotlinx.io/write(kotlinx.io.bytestring/ByteString, kotlin/Int =..., kotlin/Int =...) // kotlinx.io/write|write@kotlinx.io.Sink(kotlinx.io.bytestring.ByteString;kotlin.Int;kotlin.Int){}[0]
+final fun (kotlinx.io/Sink).kotlinx.io/writeCodePointValue(kotlin/Int) // kotlinx.io/writeCodePointValue|writeCodePointValue@kotlinx.io.Sink(kotlin.Int){}[0]
 final fun (kotlinx.io/Sink).kotlinx.io/writeDecimalLong(kotlin/Long) // kotlinx.io/writeDecimalLong|writeDecimalLong@kotlinx.io.Sink(kotlin.Long){}[0]
 final fun (kotlinx.io/Sink).kotlinx.io/writeDouble(kotlin/Double) // kotlinx.io/writeDouble|writeDouble@kotlinx.io.Sink(kotlin.Double){}[0]
 final fun (kotlinx.io/Sink).kotlinx.io/writeDoubleLe(kotlin/Double) // kotlinx.io/writeDoubleLe|writeDoubleLe@kotlinx.io.Sink(kotlin.Double){}[0]
@@ -105,6 +106,7 @@ final fun (kotlinx.io/Source).kotlinx.io/readByteArray(): kotlin/ByteArray // ko
 final fun (kotlinx.io/Source).kotlinx.io/readByteArray(kotlin/Int): kotlin/ByteArray // kotlinx.io/readByteArray|readByteArray@kotlinx.io.Source(kotlin.Int){}[0]
 final fun (kotlinx.io/Source).kotlinx.io/readByteString(): kotlinx.io.bytestring/ByteString // kotlinx.io/readByteString|readByteString@kotlinx.io.Source(){}[0]
 final fun (kotlinx.io/Source).kotlinx.io/readByteString(kotlin/Int): kotlinx.io.bytestring/ByteString // kotlinx.io/readByteString|readByteString@kotlinx.io.Source(kotlin.Int){}[0]
+final fun (kotlinx.io/Source).kotlinx.io/readCodePointValue(): kotlin/Int // kotlinx.io/readCodePointValue|readCodePointValue@kotlinx.io.Source(){}[0]
 final fun (kotlinx.io/Source).kotlinx.io/readDecimalLong(): kotlin/Long // kotlinx.io/readDecimalLong|readDecimalLong@kotlinx.io.Source(){}[0]
 final fun (kotlinx.io/Source).kotlinx.io/readDouble(): kotlin/Double // kotlinx.io/readDouble|readDouble@kotlinx.io.Source(){}[0]
 final fun (kotlinx.io/Source).kotlinx.io/readDoubleLe(): kotlin/Double // kotlinx.io/readDoubleLe|readDoubleLe@kotlinx.io.Source(){}[0]

diff --git a/core/common/src/Utf8.kt b/core/common/src/Utf8.kt
@@ -122,14 +122,23 @@ internal fun String.utf8Size(startIndex: Int = 0, endIndex: Int = length): Long
 /**
  * Encodes [codePoint] in UTF-8 and writes it to this sink.
  *
+ * Note that in general, a value retrieved from [Char.code] cannot be written directly
+ * as it may be a part of a [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2) (which can be
+ * detected using [Char.isSurrogate], or [Char.isHighSurrogate] and [Char.isLowSurrogate]).
+ * Such a pair of characters needs to be manually converted back to a single code point,
+ * which can then be written to a [Sink].
+ * Without such a conversion, data written to a [Sink] cannot be converted back
+ * to the string from which the surrogate pair was retrieved.
+ *
  * @param codePoint the codePoint to be written.
  *
  * @throws IllegalStateException when the sink is closed.
  *
- * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.utf8CodePointSample
+ * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeUtf8CodePointSample
+ * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.writeSurrogatePair
  */
 @OptIn(DelicateIoApi::class)
-internal fun Sink.writeUtf8CodePoint(codePoint: Int): Unit =
+public fun Sink.writeCodePointValue(codePoint: Int): Unit =
     writeToInternalBuffer { it.commonWriteUtf8CodePoint(codePoint) }
 
 /**
@@ -196,24 +205,31 @@ public fun Source.readString(byteCount: Long): String {
 }
 
 /**
- * Removes and returns a single UTF-8 code point, reading between 1 and 4 bytes as necessary.
+ * Decodes a single code point value from UTF-8 code units, reading between 1 and 4 bytes as necessary.
  *
  * If this source is exhausted before a complete code point can be read, this throws an
  * [EOFException] and consumes no input.
  *
- * If this source doesn't start with a properly-encoded UTF-8 code point, this method will remove
- * 1 or more non-UTF-8 bytes and return the replacement character (`U+fffd`). This covers encoding
- * problems (the input is not properly-encoded UTF-8), characters out of range (beyond the
- * `0x10ffff` limit of Unicode), code points for UTF-16 surrogates (`U+d800`..`U+dfff`) and overlong
- * encodings (such as `0xc080` for the NUL character in modified UTF-8).
+ * If this source starts with an ill-formed UTF-8 code unit sequence, this method will remove
+ * 1 or more non-UTF-8 bytes and return the replacement character (`U+fffd`).
+ *
+ * The replacement character (`U+fffd`) will also be returned if the source starts with a well-formed
+ * code unit sequence, but the decoded value does not pass further validation, for example, when
+ * the value is out of range (beyond the `0x10ffff` limit of Unicode), maps to UTF-16 surrogates (`U+d800`..`U+dfff`),
+ * or an overlong encoding is detected (such as `0xc080` for the NUL character in modified UTF-8).
+ *
+ * Note that in general, the returned value may not be directly convertible to [Char] as it may be out
+ * of [Char]'s value range; such a value should be manually converted to a
+ * [surrogate pair](https://www.unicode.org/faq/utf_bom.html#utf16-2).
  *
  * @throws EOFException when the source is exhausted before a complete code point can be read.
  * @throws IllegalStateException when the source is closed.
  *
  * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.readUtf8CodePointSample
+ * @sample kotlinx.io.samples.KotlinxIoCoreCommonSamples.surrogatePairs
  */
 @OptIn(InternalIoApi::class)
-internal fun Source.readUtf8CodePoint(): Int {
+public fun Source.readCodePointValue(): Int {
     require(1)
 
     val b0 = buffer[0].toInt()
@@ -226,13 +242,6 @@ internal fun Source.readUtf8CodePoint(): Int {
     return buffer.commonReadUtf8CodePoint()
 }
 
-/**
- * @see Source.readUtf8CodePoint
- */
-internal fun Buffer.readUtf8CodePoint(): Int {
-    return this.commonReadUtf8CodePoint()
-}
-
 /**
  * Removes and returns UTF-8 encoded characters up to but not including the next line break. A line break is
  * either `"\n"` or `"\r\n"`; these characters are not included in the result.

diff --git a/core/common/test/AbstractSourceTest.kt b/core/common/test/AbstractSourceTest.kt
@@ -1099,25 +1099,25 @@ abstract class AbstractBufferedSourceTest internal constructor(
         with(sink) {
             writeByte(0x7f)
             emit()
-            assertEquals(0x7f, source.readUtf8CodePoint().toLong())
+            assertEquals(0x7f, source.readCodePointValue().toLong())
 
             writeByte(0xdf.toByte())
             writeByte(0xbf.toByte())
             emit()
-            assertEquals(0x07ff, source.readUtf8CodePoint().toLong())
+            assertEquals(0x07ff, source.readCodePointValue().toLong())
 
             writeByte(0xef.toByte())
             writeByte(0xbf.toByte())
             writeByte(0xbf.toByte())
             emit()
-            assertEquals(0xffff, source.readUtf8CodePoint().toLong())
+            assertEquals(0xffff, source.readCodePointValue().toLong())
 
             writeByte(0xf4.toByte())
             writeByte(0x8f.toByte())
             writeByte(0xbf.toByte())
             writeByte(0xbf.toByte())
             emit()
-            assertEquals(0x10ffff, source.readUtf8CodePoint().toLong())
+            assertEquals(0x10ffff, source.readCodePointValue().toLong())
         }
     }
 
@@ -1126,20 +1126,20 @@ abstract class AbstractBufferedSourceTest internal constructor(
         with(sink) {
             writeByte(0xdf.toByte()) // a second byte is missing
             emit()
-            assertFailsWith<EOFException> { source.readUtf8CodePoint() }
+            assertFailsWith<EOFException> { source.readCodePointValue() }
             assertEquals(1, source.readByteArray().size)
 
             writeByte(0xe2.toByte())
             writeByte(0x98.toByte()) // a third byte is missing
             emit()
-            assertFailsWith<EOFException> { source.readUtf8CodePoint() }
+            assertFailsWith<EOFException> { source.readCodePointValue() }
             assertEquals(2, source.readByteArray().size)
 
             writeByte(0xf0.toByte())
             writeByte(0x9f.toByte())
             writeByte(0x92.toByte()) // a forth byte is missing
             emit()
-            assertFailsWith<EOFException> { source.readUtf8CodePoint() }
+            assertFailsWith<EOFException> { source.readCodePointValue() }
             assertEquals(3, source.readByteArray().size)
         }
     }

diff --git a/core/common/test/Utf8Test.kt b/core/common/test/Utf8Test.kt
@@ -285,22 +285,22 @@ class Utf8Test {
     @Test
     fun readCodePointFromEmptyBufferThrowsEofException() {
         val buffer = Buffer()
-        assertFailsWith<EOFException> { buffer.readUtf8CodePoint() }
+        assertFailsWith<EOFException> { buffer.readCodePointValue() }
     }
 
     @Test
     fun readLeadingContinuationByteReturnsReplacementCharacter() {
         val buffer = Buffer()
         buffer.writeByte(0xbf.toByte())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
         assertTrue(buffer.exhausted())
     }
 
     @Test
     fun readMissingContinuationBytesThrowsEofException() {
         val buffer = Buffer()
         buffer.writeByte(0xdf.toByte())
-        assertFailsWith<EOFException> { buffer.readUtf8CodePoint() }
+        assertFailsWith<EOFException> { buffer.readCodePointValue() }
         assertFalse(buffer.exhausted()) // Prefix byte wasn't consumed.
     }
 
@@ -309,21 +309,21 @@ class Utf8Test {
         // 5-byte and 6-byte code points are not supported.
         val buffer = Buffer()
         buffer.write("f888808080".decodeHex())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
         assertTrue(buffer.exhausted())
 
         buffer.write(ByteArray(Segment.SIZE - 2))
         buffer.write("f888808080".decodeHex())
         buffer.skip(Segment.SIZE - 2L)
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
         assertTrue(buffer.exhausted())
     }
 
@@ -332,8 +332,8 @@ class Utf8Test {
         // Use a non-continuation byte where a continuation byte is expected.
         val buffer = Buffer()
         buffer.write("df20".decodeHex())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
-        assertEquals(0x20, buffer.readUtf8CodePoint()) // Non-continuation character not consumed.
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
+        assertEquals(0x20, buffer.readCodePointValue()) // Non-continuation character not consumed.
         assertTrue(buffer.exhausted())
     }
 
@@ -342,18 +342,18 @@ class Utf8Test {
         // A 4-byte encoding with data above the U+10ffff Unicode maximum.
         val buffer = Buffer()
         buffer.write("f4908080".decodeHex())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
         assertTrue(buffer.exhausted())
     }
 
     @Test
     fun readSurrogateCodePoint() {
         val buffer = Buffer()
         buffer.write("eda080".decodeHex())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
         assertTrue(buffer.exhausted())
         buffer.write("edbfbf".decodeHex())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
         assertTrue(buffer.exhausted())
     }
 
@@ -362,15 +362,15 @@ class Utf8Test {
         // Use 2 bytes to encode data that only needs 1 byte.
         val buffer = Buffer()
         buffer.write("c080".decodeHex())
-        assertEquals(REPLACEMENT_CODE_POINT, buffer.readUtf8CodePoint())
+        assertEquals(REPLACEMENT_CODE_POINT, buffer.readCodePointValue())
         assertTrue(buffer.exhausted())
     }
 
     @Test
     fun writeCodePointBeyondUnicodeMaximum() {
         val buffer = Buffer()
         assertFailsWith<IllegalArgumentException>("Unexpected code point: 0x110000") {
-            buffer.writeUtf8CodePoint(0x110000)
+            buffer.writeCodePointValue(0x110000)
         }
     }
 
@@ -428,7 +428,7 @@ class Utf8Test {
 
     private fun Buffer.assertCodePointEncoded(expectedHex: String, codePoint: Int, prefixLength: Int = 0) {
         write(ByteArray(prefixLength))
-        writeUtf8CodePoint(codePoint)
+        writeCodePointValue(codePoint)
         skip(prefixLength.toLong())
         assertArrayEquals(expectedHex.decodeHex(), readByteArray())
     }
@@ -437,7 +437,7 @@ class Utf8Test {
         write(ByteArray(prefixLength))
         write(hex.decodeHex())
         skip(prefixLength.toLong())
-        assertEquals(expectedCodePoint, readUtf8CodePoint())
+        assertEquals(expectedCodePoint, readCodePointValue())
     }
 
     private fun Buffer.assertUtf8StringEncoded(expectedHex: String, string: String, prefixLength: Int = 0) {
@@ -469,7 +469,7 @@ class Utf8Test {
         val bufferUtf8 = Buffer()
         for (charIdx in string.indices) {
             val c = string[charIdx]
-            bufferUtf8.writeUtf8CodePoint(c.code)
+            bufferUtf8.writeCodePointValue(c.code)
         }
         assertArrayEquals(expectedUtf8, bufferUtf8.readByteArray())
 

diff --git a/core/common/test/samples/samples.kt b/core/common/test/samples/samples.kt
@@ -100,19 +100,84 @@ class KotlinxIoCoreCommonSamples {
     fun writeUtf8CodePointSample() {
         val buffer = Buffer()
 
-        buffer.writeInt('Δ'.code) // writes integer value as is
-        assertContentEquals(byteArrayOf(0, 0, 0x3, 0x94.toByte()), buffer.readByteArray())
+        // Basic Latin (a.k.a. ASCII) characters are encoded with a single byte
+        buffer.writeCodePointValue('Y'.code)
+        assertContentEquals(byteArrayOf(0x59), buffer.readByteArray())
 
-        buffer.writeUtf8CodePoint('Δ'.code) // encodes code point using UTF-8 encoding
+        // wider characters are encoded into multiple UTF-8 code units
+        buffer.writeCodePointValue('Δ'.code)
         assertContentEquals(byteArrayOf(0xce.toByte(), 0x94.toByte()), buffer.readByteArray())
+
+        // note the difference: unlike writeCodePointValue, writeInt writes the value as is, without encoding
+        buffer.writeInt('Δ'.code)
+        assertContentEquals(byteArrayOf(0, 0, 0x3, 0x94.toByte()), buffer.readByteArray())
+    }
+
+    @Test
+    fun writeSurrogatePair() {
+        val buffer = Buffer()
+
+        // U+1F31E (a.k.a. "sun with face") is too wide to fit in a single UTF-16 character,
+        // so it's represented using a surrogate pair.
+        val chars = "🌞".toCharArray()
+        assertEquals(2, chars.size)
+
+        // such a pair has to be manually converted to a single code point
+        assertTrue(chars[0].isHighSurrogate())
+        assertTrue(chars[1].isLowSurrogate())
+
+        val highSurrogate = chars[0].code
+        val lowSurrogate = chars[1].code
+
+        // see https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
+        val codePoint = 0x10000 + (highSurrogate - 0xD800).shl(10).or(lowSurrogate - 0xDC00)
+        assertEquals(0x1F31E, codePoint)
+
+        // now we can write the code point
+        buffer.writeCodePointValue(codePoint)
+        // and read the correct string back
+        assertEquals("🌞", buffer.readString())
+
+        // we won't achieve that by writing the surrogates as they are
+        buffer.apply {
+            writeCodePointValue(highSurrogate)
+            writeCodePointValue(lowSurrogate)
+        }
+        assertNotEquals("🌞", buffer.readString())
     }
 
     @Test
     fun readUtf8CodePointSample() {
         val buffer = Buffer()
 
         buffer.writeUShort(0xce94U)
-        assertEquals(0x394, buffer.readUtf8CodePoint()) // decodes single UTF-8 encoded code point
+        assertEquals(0x394, buffer.readCodePointValue()) // decodes a single UTF-8 encoded code point
+    }
+
+    @Test
+    fun surrogatePairs() {
+        val buffer = Buffer()
+
+        // that's a U+1F31A, a.k.a. "new moon with face"
+        buffer.writeString("🌚")
+        // it should be encoded with 4 code units
+        assertEquals(4, buffer.size)
+
+        // let's read it back as a single code point
+        val moonCodePoint = buffer.readCodePointValue()
+        // all code units were consumed
+        assertEquals(0, buffer.size)
+
+        // the moon is too wide to fit in a single UTF-16 character!
+        assertNotEquals(moonCodePoint, moonCodePoint.toChar().code)
+        // "too wide" means in the [U+010000, U+10FFFF] range
+        assertTrue(moonCodePoint in 0x10000..0x10FFFF)
+
+        // See https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF for details
+        val highSurrogate = (0xD800 + (moonCodePoint - 0x10000).ushr(10)).toChar()
+        val lowSurrogate = (0xDC00 + (moonCodePoint - 0x10000).and(0x3FF)).toChar()
+
+        assertContentEquals(charArrayOf(highSurrogate, lowSurrogate), "🌚".toCharArray())
     }
 
     @Test