Skip to content

Fix new columns names generation during split operation (#224) #240

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.SplitWithTransform
import org.jetbrains.kotlinx.dataframe.api.rows
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
import org.jetbrains.kotlinx.dataframe.impl.ColumnDataCollector
import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnWithPath
import org.jetbrains.kotlinx.dataframe.impl.createDataCollector
import org.jetbrains.kotlinx.dataframe.impl.nameGenerator
import org.jetbrains.kotlinx.dataframe.nrow

internal fun valueToList(value: Any?, splitStrings: Boolean = true): List<Any?> = when (value) {
Expand All @@ -25,9 +25,13 @@ internal fun <T, C, R> splitImpl(
val nrow = clause.df.nrow

val removeResult = clause.df.removeImpl(columns = clause.columns)
val nameGenerator = removeResult.df.nameGenerator()

val toInsert = removeResult.removedColumns.flatMap { node ->
// As we insert multiple columns at once it's possible to encounter a name conflict within a batch of a new columns
// that's why an old school mutable list is used here to check for name conflicts in intermediate steps of generation
// of columns to insert
val columnsToInsert = mutableListOf<ColumnToInsert>()

removeResult.removedColumns.forEach { node ->

val column = node.toColumnWithPath<C>()
val columnCollectors = mutableListOf<ColumnDataCollector>()
Expand All @@ -46,22 +50,52 @@ internal fun <T, C, R> splitImpl(
columnCollectors[j].add(clause.default)
}

var names = columnNamesGenerator(column, columnCollectors.size)
if (names.size < columnCollectors.size) {
names = names + (1..(columnCollectors.size - names.size)).map { "split$it" }
}
val names = columnNamesGenerator(column, columnCollectors.size)
val sourcePath = node.pathFromRoot()

columnCollectors.forEachIndexed { i, col ->
val preferredName = names.getOrNull(i)

columnCollectors.mapIndexed { i, col ->
val pathToInsert = if (clause.inward) sourcePath else sourcePath.dropLast(1)
val name = generateUnusedName(removeResult.df, preferredName, pathToInsert, columnsToInsert)

val path = pathToInsert + name

val name = nameGenerator.addUnique(names[i])
val sourcePath = node.pathFromRoot()
val path = if (clause.inward) sourcePath + name else sourcePath.dropLast(1) + name
val data = col.toColumn(name)
ColumnToInsert(path, data, node)
columnsToInsert.add(ColumnToInsert(path, data, node))
}
}

return removeResult.df.insertImpl(columnsToInsert)
}

internal fun generateUnusedName(
df: DataFrame<*>,
preferredName: String?,
insertPath: ColumnPath,
columnsToBeInserted: List<ColumnToInsert>
): String {
// check if column with this name already exists in the df in the same position in the hierarchy,
// or we already have a column with this name in the list of columns to be inserted to the same position in the hierarchy
fun isUsed(name: String) =
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not very fond of inner functions, but you do reuse arguments from the outer function and it makes things more readable, so I think I'll let it slide :)

df.getColumnOrNull(insertPath + name) != null || columnsToBeInserted.any { it.insertionPath == insertPath + name }

fun generateNameVariationByTryingNumericSuffixes(original: String? = null, startSuffix: Int): String {
var k = startSuffix
var name = original ?: "split$k"
while (isUsed(name)) {
name = original ?: "split"
name = "${name}${k++}"
}

return name
}

return removeResult.df.insertImpl(toInsert)
return if (preferredName == null) {
generateNameVariationByTryingNumericSuffixes(startSuffix = 1)
} else {
generateNameVariationByTryingNumericSuffixes(preferredName, 1)
}
}

internal fun String.splitDefault() = split(",").map { it.trim() }
105 changes: 105 additions & 0 deletions core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/split.kt
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,16 @@ import io.kotest.matchers.shouldBe
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.hasNulls
import org.jetbrains.kotlinx.dataframe.impl.DataRowImpl
import org.jetbrains.kotlinx.dataframe.type
import org.junit.Test
import kotlin.reflect.typeOf

class SplitTests {

val stringPairDf = dataFrameOf("first", "second")("22-65", "22-66")
val listPairDf = dataFrameOf("first", "second")(listOf("22", "65"), listOf("22", "66"))

@Test
fun `split with default`() {
val recentDelays = listOf(listOf(23, 47), listOf(), listOf(24, 43, 87), listOf(13), listOf(67, 32)).toColumn("RecentDelays")
Expand Down Expand Up @@ -60,4 +64,105 @@ class SplitTests {
3, emptyList<Int>(), emptyList<Int>()
)
}

@Test
fun `split string by delimiter inward`() {
val res = stringPairDf.split("first", "second").by("-").inward("left", "right")

res shouldBe dataFrameOf(
columnOf(columnOf("22") named "left", columnOf("65") named "right") named "first",
columnOf(columnOf("22") named "left", columnOf("66") named "right") named "second"
)
}

@Test
fun `split string by delimiter into columns with suffixes`() {
val res = stringPairDf.split("first", "second").by("-").into("left", "right")

res shouldBe dataFrameOf(
columnOf("22") named "left",
columnOf("65") named "right",
columnOf("22") named "left1",
columnOf("66") named "right1"
)
}

@Test
fun `split list inward with autogenerated names`() {
val res = listPairDf.split { "first"<List<String>>() and "second"<List<String>>() }.inward()

res shouldBe dataFrameOf(
columnOf(columnOf("22") named "split1", columnOf("65") named "split2") named "first",
columnOf(columnOf("22") named "split1", columnOf("66") named "split2") named "second"
)
}

@Test
fun `split list into with autogenerated names`() {
val res = listPairDf.split { "first"<List<String>>() and "second"<List<String>>() }.into()

res shouldBe dataFrameOf(
columnOf("22") named "split1",
columnOf("65") named "split2",
columnOf("22") named "split3",
columnOf("66") named "split4"
)
}

@Test
fun `sequence of splits with autogenerated names`() {
var res = listPairDf.split { "first"<List<String>>() }.into()
res = res.split { "second"<List<String>>() }.into()

res shouldBe dataFrameOf(
columnOf("22") named "split1",
columnOf("65") named "split2",
columnOf("22") named "split3",
columnOf("66") named "split4"
)
}

@Test
fun `split column group inward`() {
val df = stringPairDf.group("first", "second").into("group")

// Note: this operation replaces original columns in group so there is no name conflict
val res = df.split { "group"<DataRowImpl<*>>() }
.by { it -> listOf(it[1], it[0]) } // swap columns
.inward("first", "second") // no name conflict

res shouldBe dataFrameOf(
columnOf(columnOf("22-66") named "first", columnOf("22-65") named "second") named "group"
)
}

@Test
fun `split column group into hierarchy with correct names`() {
val df = dataFrameOf(
columnOf(
columnOf("a") named "first",
columnOf(
columnOf("b") named "first",
columnOf("c") named "second"
) named "nestedGroup"
) named "topLevelGroup",
columnOf("d") named "first",
)

val topLevelGroup by columnGroup()
val nestedGroup by topLevelGroup.columnGroup()

val res = df.split { nestedGroup }
.by { it -> listOf(it[0], it[1]) }
.into("first", "second") // name conflict

res shouldBe dataFrameOf(
columnOf(
columnOf("a") named "first",
columnOf("b") named "first1",
columnOf("c") named "second"
) named "topLevelGroup",
columnOf("d") named "first",
)
}
}