Kotlin · koperagen · May 19, 2025 · May 14, 2025 · May 15, 2025 · May 15, 2025
diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/split.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/split.kt
@@ -7,6 +7,8 @@ import org.jetbrains.kotlinx.dataframe.DataColumn
 import org.jetbrains.kotlinx.dataframe.DataFrame
 import org.jetbrains.kotlinx.dataframe.DataRow
 import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload
+import org.jetbrains.kotlinx.dataframe.annotations.Interpretable
+import org.jetbrains.kotlinx.dataframe.annotations.Refine
 import org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor
 import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
 import org.jetbrains.kotlinx.dataframe.columns.ColumnSet
@@ -18,10 +20,12 @@ import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl
 import org.jetbrains.kotlinx.dataframe.impl.asList
 import org.jetbrains.kotlinx.dataframe.impl.columnName
 import org.jetbrains.kotlinx.dataframe.impl.getListType
+import org.jetbrains.kotlinx.dataframe.util.SPLIT_STR
 import kotlin.reflect.KProperty
 import kotlin.reflect.KType
 import kotlin.reflect.typeOf
 
+@Interpretable("Split0")
 public fun <T, C> DataFrame<T>.split(columns: ColumnsSelector<T, C?>): Split<T, C> = Split(this, columns)
 
 public fun <T> DataFrame<T>.split(vararg columns: String): Split<T, Any> = split { columns.toColumnSet() }
@@ -62,22 +66,27 @@ public typealias ColumnNamesGenerator<C> = ColumnWithPath<C>.(extraColumnIndex:
 
 // region default
 
+@Interpretable("SplitDefault")
 public inline fun <T, C : Iterable<R>, reified R> Split<T, C>.default(value: R?): SplitWithTransform<T, C, R> =
     by { it }.default(value)
 
+@Deprecated(SPLIT_STR, ReplaceWith("""by(",").default(value)"""))
 public fun <T> Split<T, String>.default(value: String?): SplitWithTransform<T, String, String> =
     by { it.splitDefault() }.default(value)
 
+@Interpretable("SplitWithTransformDefault")
 public fun <T, C, R> SplitWithTransform<T, C, R>.default(value: R?): SplitWithTransform<T, C, R> = copy(default = value)
 
 // endregion
 
 // region by
 
+@Interpretable("ByIterable")
 public inline fun <T, C, reified R> Split<T, C>.by(
     noinline splitter: DataRow<T>.(C) -> Iterable<R>,
 ): SplitWithTransform<T, C, R> = by(typeOf<R>(), splitter)
 
+@Interpretable("ByCharDelimiters")
 public fun <T, C> Split<T, C>.by(
     vararg delimiters: Char,
     trim: Boolean = true,
@@ -90,6 +99,22 @@ public fun <T, C> Split<T, C>.by(
         }
     }
 
+/**
+ * Example:
+ * ```
+ * dataFrameOf("str" to listOf("1    2 3     4"))
+ *   .split("str").by("\\s+".toRegex())
+ *   // when the list of explicitly specified column names is not long enough (or none are given at all),
+ *   // names for the additional columns are generated
+ *   .into()
+ * ```
+ * Result:
+ * ```
+ *    split1 split2 split3 split4
+ *         1      2      3      4
+ * ```
+ */
+@Interpretable("ByRegex")
 public fun <T, C> Split<T, C>.by(
     regex: Regex,
     trim: Boolean = true,
@@ -101,6 +126,7 @@ public fun <T, C> Split<T, C>.by(
         }
     }
 
+@Interpretable("ByStringDelimiters")
 public fun <T, C> Split<T, C>.by(
     vararg delimiters: String,
     trim: Boolean = true,
@@ -126,10 +152,26 @@ internal inline fun <T, C, R> Split<T, C>.by(
 
 // region match
 
+/**
+ * Creates new String columns according to MatchResult [capturing groups](https://kotlinlang.org/api/core/kotlin-stdlib/kotlin.text/-match-result/group-values.html),
+ * excluding the first group, which is the entire matched String.
+ * Example:
+ * ```
+ * dataFrameOf("str" to listOf("100 ml", "1 L"))
+ *      .split { "str"<String>() }.match("(\\d+)\\s*(ml|l|L)").into("volume", "unit")
+ * ```
+ * Created columns will be nullable if [regex] doesn't match some rows or there are nulls in the original column.
+ * See the [Split.by] overload with a regex parameter if you want to split a String value by a [Regex] delimiter.
+ */
+@Interpretable("MatchStringRegex")
 public fun <T, C : String?> Split<T, C>.match(
     @Language("RegExp") regex: String,
 ): SplitWithTransform<T, C, String?> = match(regex.toRegex())
 
+/**
+ * @include [match]
+ */
+@Interpretable("MatchRegex")
 public fun <T, C : String?> Split<T, C>.match(regex: Regex): SplitWithTransform<T, C, String?> =
     by {
         it?.let {
@@ -171,6 +213,8 @@ public fun <T, C, R> SplitWithTransform<T, C, R>.into(
     vararg otherNames: KProperty<*>,
 ): DataFrame<T> = into(listOf(firstName.columnName) + otherNames.map { it.columnName })
 
+@Refine
+@Interpretable("SplitWithTransformInto0")
 public fun <T, C, R> SplitWithTransform<T, C, R>.into(
     vararg names: String,
     extraNamesGenerator: (ColumnWithPath<C>.(extraColumnIndex: Int) -> String)? = null,
@@ -188,6 +232,8 @@ public fun <T, C, R> SplitWithTransform<T, C, R>.into(
         }
     }
 
+@Refine
+@Interpretable("SplitIterableInto")
 public fun <T, C : Iterable<*>> Split<T, C>.into(
     vararg names: String,
     extraNamesGenerator: ColumnNamesGenerator<C>? = null,
@@ -199,6 +245,8 @@ public fun <T, C> Split<T, DataFrame<C>>.into(
     extraNamesGenerator: ColumnNamesGenerator<DataFrame<C>>? = null,
 ): DataFrame<T> = by { it.rows() }.into(names.toList(), extraNamesGenerator)
 
+@Refine
+@Interpretable("SplitPair")
 public fun <T, A, B> Split<T, Pair<A, B>>.into(firstCol: String, secondCol: String): DataFrame<T> =
     by { listOf(it.first, it.second) }.into(firstCol, secondCol)
 
@@ -211,6 +259,7 @@ public inline fun <T, reified A, reified B> Split<T, Pair<A, B>>.into(
     secondCol: ColumnAccessor<B>,
 ): DataFrame<T> = by { listOf(it.first, it.second) }.into(firstCol, secondCol)
 
+@Deprecated(SPLIT_STR, ReplaceWith("""by(",").into(*names, extraNamesGenerator = extraNamesGenerator)"""))
 @JvmName("intoTC")
 public fun <T> Split<T, String>.into(
     vararg names: String,
@@ -226,6 +275,8 @@ public fun <T, C, R> SplitWithTransform<T, C, R>.inward(
     extraNamesGenerator: ColumnNamesGenerator<C>? = null,
 ): DataFrame<T> = copy(inward = true).into(names.toList(), extraNamesGenerator)
 
+@Refine
+@Interpretable("SplitWithTransformInward0")
 public fun <T, C, R> SplitWithTransform<T, C, R>.inward(
     vararg names: String,
     extraNamesGenerator: ColumnNamesGenerator<C>? = null,
@@ -272,6 +323,7 @@ public inline fun <T, reified A, reified B> Split<T, Pair<A, B>>.inward(
     secondCol: ColumnAccessor<B>,
 ): DataFrame<T> = by { listOf(it.first, it.second) }.inward(firstCol, secondCol)
 
+@Deprecated(SPLIT_STR, ReplaceWith("""by(",").inward(*names, extraNamesGenerator = extraNamesGenerator)"""))
 @JvmName("inwardTC")
 public fun <T> Split<T, String>.inward(
     vararg names: String,
@@ -282,6 +334,8 @@ public fun <T> Split<T, String>.inward(
 
 // region intoColumns
 
+@Refine
+@Interpretable("SplitAnyFrameIntoColumns")
 public fun <T, C : AnyFrame> Split<T, C>.intoColumns(): DataFrame<T> =
     df.convert(columns).with {
         when {
@@ -296,11 +350,15 @@ public fun <T, C : AnyFrame> Split<T, C>.intoColumns(): DataFrame<T> =
 // region intoRows
 
 @JvmName("intoRowsTC")
+@Refine
+@Interpretable("SplitIntoRows")
 public inline fun <T, C : Iterable<R>, reified R> Split<T, C>.intoRows(dropEmpty: Boolean = true): DataFrame<T> =
     by { it }
         .intoRows(dropEmpty)
 
 @JvmName("intoRowsFrame")
+@Refine
+@Interpretable("SplitAnyFrameRows")
 public fun <T, C : AnyFrame> Split<T, C>.intoRows(dropEmpty: Boolean = true): DataFrame<T> =
     by { it.rows() }.intoRows(dropEmpty)
 
@@ -309,6 +367,8 @@ internal inline fun <T, C, R> Convert<T, C?>.splitInplace(
     crossinline transform: DataRow<T>.(C) -> Iterable<R>,
 ) = withRowCellImpl(getListType(type), Infer.None) { if (it == null) emptyList() else transform(it).asList() }
 
+@Refine
+@Interpretable("SplitWithTransformIntoRows")
 public fun <T, C, R> SplitWithTransform<T, C, R>.intoRows(dropEmpty: Boolean = true): DataFrame<T> {
     val paths = df.getColumnPaths(columns).toColumnSet()
     return df.convert { paths as ColumnSet<C?> }.splitInplace(tartypeOf, transform).explode(dropEmpty) { paths }
@@ -319,8 +379,12 @@ public fun <T, C, R> SplitWithTransform<T, C, R>.intoRows(dropEmpty: Boolean = t
 // region inplace
 
 @JvmName("inplaceTC")
+@Refine
+@Interpretable("SplitInplace")
 public inline fun <T, C : Iterable<R>, reified R> Split<T, C>.inplace(): DataFrame<T> = by { it }.inplace()
 
+@Refine
+@Interpretable("SplitWithTransformInplace")
 public fun <T, C, R> SplitWithTransform<T, C, R>.inplace(): DataFrame<T> =
     df.convert(columns).splitInplace(tartypeOf, transform)
 

diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/util/deprecationMessages.kt
@@ -160,6 +160,9 @@ internal const val TO_CSV = "toCsv() is deprecated in favor of toCsvStr() in dat
 internal const val TO_CSV_IMPORT = "org.jetbrains.kotlinx.dataframe.io.toCsvStr"
 internal const val TO_CSV_REPLACE = "this.toCsvStr()"
 
+internal const val SPLIT_STR =
+    "Please explicitly specify how the String should be split. This shortcut will be removed in version 1.1.0"
+
 // endregion
 
 // region keep across releases

diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/testSets/person/DataFrameTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/testSets/person/DataFrameTests.kt
@@ -2184,6 +2184,14 @@ class DataFrameTests : BaseTest() {
         split["name"] shouldBe typed.name.map { it.toCharArray().toList() }
     }
 
+    @Test
+    fun `split iterable inplace`() {
+        val df = dataFrameOf("a" to listOf(listOf(1), null)).split { "a"<List<Int>?>() }.inplace()
+
+        df["a"].type() shouldBe typeOf<List<Int>>()
+        df["a"].values() shouldBe listOf(listOf(1), emptyList())
+    }
+
     @Test
     fun `split into rows with transform`() {
         val split = typed.split { city }.by { it.toCharArray().toList() }.intoRows()