Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/split.kt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.DataRow
import org.jetbrains.kotlinx.dataframe.annotations.AccessApiOverload
import org.jetbrains.kotlinx.dataframe.annotations.Interpretable
import org.jetbrains.kotlinx.dataframe.annotations.Refine
import org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.columns.ColumnSet
Expand All @@ -18,10 +20,12 @@ import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl
import org.jetbrains.kotlinx.dataframe.impl.asList
import org.jetbrains.kotlinx.dataframe.impl.columnName
import org.jetbrains.kotlinx.dataframe.impl.getListType
import org.jetbrains.kotlinx.dataframe.util.SPLIT_STR
import kotlin.reflect.KProperty
import kotlin.reflect.KType
import kotlin.reflect.typeOf

/**
 * Begins a split operation over the columns picked by [columns].
 *
 * The receiver and the selector are handed to the [Split] constructor; the returned [Split]
 * is the starting point for the follow-up calls defined on it (`by`, `match`, `default`,
 * `into`, `intoRows`, `inplace`, ...).
 *
 * @param columns selector of the (possibly nullable) columns to split.
 * @return a [Split] built from this [DataFrame] and [columns].
 */
@Interpretable("Split0")
public fun <T, C> DataFrame<T>.split(columns: ColumnsSelector<T, C?>): Split<T, C> {
    return Split(this, columns)
}

/**
 * Begins a split operation over the columns with the given names.
 *
 * The names are turned into a column set with `toColumnSet()` and passed on to the
 * selector-based `split` overload.
 *
 * @param columns names of the columns to split.
 * @return a [Split] whose column value type is [Any].
 */
public fun <T> DataFrame<T>.split(vararg columns: String): Split<T, Any> {
    return split { columns.toColumnSet() }
}
Expand Down Expand Up @@ -62,22 +66,27 @@ public typealias ColumnNamesGenerator<C> = ColumnWithPath<C>.(extraColumnIndex:

// region default

/**
 * Records [value] as the default for a split of an iterable-valued column.
 *
 * The cell itself is used as the sequence of parts (identity splitter, `by { it }`), and
 * [value] is then set through the [SplitWithTransform] `default` overload.
 *
 * @param value default to record; may be `null`.
 */
@Interpretable("SplitDefault")
public inline fun <T, C : Iterable<R>, reified R> Split<T, C>.default(value: R?): SplitWithTransform<T, C, R> {
    return by { it }.default(value)
}

/**
 * Deprecated shortcut that records a default [value] for a [String] column without stating
 * how the String is split.
 *
 * Each String is split with `splitDefault()`. The suggested replacement is to call `by(",")`
 * explicitly before `default`.
 *
 * @param value default to record; may be `null`.
 */
@Deprecated(SPLIT_STR, ReplaceWith("""by(",").default(value)"""))
public fun <T> Split<T, String>.default(value: String?): SplitWithTransform<T, String, String> {
    return by { text -> text.splitDefault() }.default(value)
}

/**
 * Returns a copy of this [SplitWithTransform] whose `default` is [value];
 * every other setting is carried over unchanged.
 *
 * @param value new default; may be `null`.
 */
@Interpretable("SplitWithTransformDefault")
public fun <T, C, R> SplitWithTransform<T, C, R>.default(value: R?): SplitWithTransform<T, C, R> {
    return copy(default = value)
}

// endregion

// region by

/**
 * Splits every selected cell with a custom [splitter].
 *
 * The element type [R] is captured with `typeOf` and forwarded, together with [splitter],
 * to the non-reified `by` overload.
 *
 * @param splitter called with the current [DataRow] as receiver and the cell value as argument;
 * returns the parts the cell is split into.
 */
@Interpretable("ByIterable")
public inline fun <T, C, reified R> Split<T, C>.by(
    noinline splitter: DataRow<T>.(C) -> Iterable<R>,
): SplitWithTransform<T, C, R> {
    val partType = typeOf<R>()
    return by(partType, splitter)
}

@Interpretable("ByCharDelimiters")
public fun <T, C> Split<T, C>.by(
vararg delimiters: Char,
trim: Boolean = true,
Expand All @@ -90,6 +99,22 @@ public fun <T, C> Split<T, C>.by(
}
}

/**
* Example:
* ```
* dataFrameOf("str" to listOf("1 2 3 4"))
* .split("str").by("\\s+".toRegex())
* // when the list of explicitly specified columnNames is not long enough (or none are given at all),
* // names for the additional columns are generated
* .into()
* ```
* Result:
* ```
* split1 split2 split3 split4
* 1 2 3 4
* ```
*/
@Interpretable("ByRegex")
public fun <T, C> Split<T, C>.by(
regex: Regex,
trim: Boolean = true,
Expand All @@ -101,6 +126,7 @@ public fun <T, C> Split<T, C>.by(
}
}

@Interpretable("ByStringDelimiters")
public fun <T, C> Split<T, C>.by(
vararg delimiters: String,
trim: Boolean = true,
Expand All @@ -126,10 +152,26 @@ internal inline fun <T, C, R> Split<T, C>.by(

// region match

/**
 * Creates new String columns from the MatchResult
 * [capturing groups](https://kotlinlang.org/api/core/kotlin-stdlib/kotlin.text/-match-result/group-values.html)
 * of [regex], one column per group. The first group, which is the entire matched String, is excluded.
 *
 * Example:
 * ```
 * dataFrameOf("str" to listOf("100 ml", "1 L"))
 *     .split { "str"<String>() }.match("(\\d+)\\s*(ml|l|L)").into("volume", "unit")
 * ```
 * The created columns are nullable if [regex] doesn't match some rows or if the original column contains nulls.
 *
 * If you want to split a String value by a [Regex] delimiter instead, see the [Split.by] overload
 * that takes a regex parameter.
 */
@Interpretable("MatchStringRegex")
public fun <T, C : String?> Split<T, C>.match(
    @Language("RegExp") regex: String,
): SplitWithTransform<T, C, String?> {
    val pattern = regex.toRegex()
    return match(pattern)
}

/**
* @include [match]
*/
@Interpretable("MatchRegex")
public fun <T, C : String?> Split<T, C>.match(regex: Regex): SplitWithTransform<T, C, String?> =
by {
it?.let {
Expand Down Expand Up @@ -171,6 +213,8 @@ public fun <T, C, R> SplitWithTransform<T, C, R>.into(
vararg otherNames: KProperty<*>,
): DataFrame<T> = into(listOf(firstName.columnName) + otherNames.map { it.columnName })

@Refine
@Interpretable("SplitWithTransformInto0")
public fun <T, C, R> SplitWithTransform<T, C, R>.into(
vararg names: String,
extraNamesGenerator: (ColumnWithPath<C>.(extraColumnIndex: Int) -> String)? = null,
Expand All @@ -188,6 +232,8 @@ public fun <T, C, R> SplitWithTransform<T, C, R>.into(
}
}

@Refine
@Interpretable("SplitIterableInto")
public fun <T, C : Iterable<*>> Split<T, C>.into(
vararg names: String,
extraNamesGenerator: ColumnNamesGenerator<C>? = null,
Expand All @@ -199,6 +245,8 @@ public fun <T, C> Split<T, DataFrame<C>>.into(
extraNamesGenerator: ColumnNamesGenerator<DataFrame<C>>? = null,
): DataFrame<T> = by { it.rows() }.into(names.toList(), extraNamesGenerator)

/**
 * Splits a [Pair] column into two columns.
 *
 * The pair's `first` and `second` are listed in that order and handed to `into`
 * with the names [firstCol] and [secondCol].
 *
 * @param firstCol name passed first to `into`.
 * @param secondCol name passed second to `into`.
 */
@Refine
@Interpretable("SplitPair")
public fun <T, A, B> Split<T, Pair<A, B>>.into(firstCol: String, secondCol: String): DataFrame<T> {
    val halves = by { pair -> listOf(pair.first, pair.second) }
    return halves.into(firstCol, secondCol)
}

Expand All @@ -211,6 +259,7 @@ public inline fun <T, reified A, reified B> Split<T, Pair<A, B>>.into(
secondCol: ColumnAccessor<B>,
): DataFrame<T> = by { listOf(it.first, it.second) }.into(firstCol, secondCol)

@Deprecated(SPLIT_STR, ReplaceWith("""by(",").into(*names, extraNamesGenerator = extraNamesGenerator)"""))
@JvmName("intoTC")
public fun <T> Split<T, String>.into(
vararg names: String,
Expand All @@ -226,6 +275,8 @@ public fun <T, C, R> SplitWithTransform<T, C, R>.inward(
extraNamesGenerator: ColumnNamesGenerator<C>? = null,
): DataFrame<T> = copy(inward = true).into(names.toList(), extraNamesGenerator)

@Refine
@Interpretable("SplitWithTransformInward0")
public fun <T, C, R> SplitWithTransform<T, C, R>.inward(
vararg names: String,
extraNamesGenerator: ColumnNamesGenerator<C>? = null,
Expand Down Expand Up @@ -272,6 +323,7 @@ public inline fun <T, reified A, reified B> Split<T, Pair<A, B>>.inward(
secondCol: ColumnAccessor<B>,
): DataFrame<T> = by { listOf(it.first, it.second) }.inward(firstCol, secondCol)

@Deprecated(SPLIT_STR, ReplaceWith("""by(",").inward(*names, extraNamesGenerator = extraNamesGenerator)"""))
@JvmName("inwardTC")
public fun <T> Split<T, String>.inward(
vararg names: String,
Expand All @@ -282,6 +334,8 @@ public fun <T> Split<T, String>.inward(

// region intoColumns

@Refine
@Interpretable("SplitAnyFrameIntoColumns")
public fun <T, C : AnyFrame> Split<T, C>.intoColumns(): DataFrame<T> =
df.convert(columns).with {
when {
Expand All @@ -296,11 +350,15 @@ public fun <T, C : AnyFrame> Split<T, C>.intoColumns(): DataFrame<T> =
// region intoRows

/**
 * Splits iterable-valued cells into rows, using each cell as its own sequence of parts.
 *
 * Delegates to the [SplitWithTransform] `intoRows` overload with an identity splitter (`by { it }`).
 *
 * @param dropEmpty forwarded unchanged to the delegated `intoRows` call.
 */
@JvmName("intoRowsTC")
@Refine
@Interpretable("SplitIntoRows")
public inline fun <T, C : Iterable<R>, reified R> Split<T, C>.intoRows(dropEmpty: Boolean = true): DataFrame<T> {
    return by { it }.intoRows(dropEmpty = dropEmpty)
}

/**
 * Splits frame-valued cells into rows, using the `rows()` of each nested frame as the parts.
 *
 * Delegates to the [SplitWithTransform] `intoRows` overload.
 *
 * @param dropEmpty forwarded unchanged to the delegated `intoRows` call.
 */
@JvmName("intoRowsFrame")
@Refine
@Interpretable("SplitAnyFrameRows")
public fun <T, C : AnyFrame> Split<T, C>.intoRows(dropEmpty: Boolean = true): DataFrame<T> {
    return by { frame -> frame.rows() }.intoRows(dropEmpty = dropEmpty)
}

Expand All @@ -309,6 +367,8 @@ internal inline fun <T, C, R> Convert<T, C?>.splitInplace(
crossinline transform: DataRow<T>.(C) -> Iterable<R>,
) = withRowCellImpl(getListType(type), Infer.None) { if (it == null) emptyList() else transform(it).asList() }

@Refine
@Interpretable("SplitWithTransformIntoRows")
public fun <T, C, R> SplitWithTransform<T, C, R>.intoRows(dropEmpty: Boolean = true): DataFrame<T> {
val paths = df.getColumnPaths(columns).toColumnSet()
return df.convert { paths as ColumnSet<C?> }.splitInplace(tartypeOf, transform).explode(dropEmpty) { paths }
Expand All @@ -319,8 +379,12 @@ public fun <T, C, R> SplitWithTransform<T, C, R>.intoRows(dropEmpty: Boolean = t
// region inplace

/**
 * Splits iterable-valued cells in place, using each cell as its own sequence of parts.
 *
 * Delegates to the [SplitWithTransform] `inplace` overload with an identity splitter (`by { it }`).
 */
@JvmName("inplaceTC")
@Refine
@Interpretable("SplitInplace")
public inline fun <T, C : Iterable<R>, reified R> Split<T, C>.inplace(): DataFrame<T> {
    return by { it }.inplace()
}

/**
 * Applies the split in place: the selected columns are converted with `splitInplace`,
 * which receives this operation's `tartypeOf` and `transform`.
 */
@Refine
@Interpretable("SplitWithTransformInplace")
public fun <T, C, R> SplitWithTransform<T, C, R>.inplace(): DataFrame<T> {
    val conversion = df.convert(columns)
    return conversion.splitInplace(tartypeOf, transform)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,9 @@ internal const val TO_CSV = "toCsv() is deprecated in favor of toCsvStr() in dat
internal const val TO_CSV_IMPORT = "org.jetbrains.kotlinx.dataframe.io.toCsvStr"
internal const val TO_CSV_REPLACE = "this.toCsvStr()"

// Deprecation message for the String-only split shortcuts that do not state how the String is split.
internal const val SPLIT_STR =
"Please explicitly specify how the String should be split. This shortcut will be removed in version 1.1.0"

// endregion

// region keep across releases
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2184,6 +2184,14 @@ class DataFrameTests : BaseTest() {
split["name"] shouldBe typed.name.map { it.toCharArray().toList() }
}

// Splitting a nullable list column in place: the column type becomes List<Int> and the null cell an empty list.
@Test
fun `split iterable inplace`() {
    val source = dataFrameOf("a" to listOf(listOf(1), null))
    val result = source.split { "a"<List<Int>?>() }.inplace()
    val column = result["a"]

    column.type() shouldBe typeOf<List<Int>>()
    column.values() shouldBe listOf(listOf(1), emptyList())
}

@Test
fun `split into rows with transform`() {
val split = typed.split { city }.by { it.toCharArray().toList() }.intoRows()
Expand Down
Loading