Merge pull request #240 from ermolenkodev/224

ermolenkodev · web-flow · commit a014fd559cf3 · 2023-01-25T19:10:05.000+03:00
Fix new columns names generation during split operation (#224)
diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/split.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/split.kt
@@ -4,11 +4,11 @@ import org.jetbrains.kotlinx.dataframe.AnyFrame
 import org.jetbrains.kotlinx.dataframe.DataFrame
 import org.jetbrains.kotlinx.dataframe.api.SplitWithTransform
 import org.jetbrains.kotlinx.dataframe.api.rows
+import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
 import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
 import org.jetbrains.kotlinx.dataframe.impl.ColumnDataCollector
 import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnWithPath
 import org.jetbrains.kotlinx.dataframe.impl.createDataCollector
-import org.jetbrains.kotlinx.dataframe.impl.nameGenerator
 import org.jetbrains.kotlinx.dataframe.nrow
 
 internal fun valueToList(value: Any?, splitStrings: Boolean = true): List<Any?> = when (value) {
@@ -25,9 +25,13 @@ internal fun <T, C, R> splitImpl(
     val nrow = clause.df.nrow
 
     val removeResult = clause.df.removeImpl(columns = clause.columns)
-    val nameGenerator = removeResult.df.nameGenerator()
 
-    val toInsert = removeResult.removedColumns.flatMap { node ->
+    // As we insert multiple columns at once it's possible to encounter a name conflict within a batch of a new columns
+    // that's why an old school mutable list is used here to check for name conflicts in intermediate steps of generation
+    // of columns to insert
+    val columnsToInsert = mutableListOf<ColumnToInsert>()
+
+    removeResult.removedColumns.forEach { node ->
 
         val column = node.toColumnWithPath<C>()
         val columnCollectors = mutableListOf<ColumnDataCollector>()
@@ -46,22 +50,52 @@ internal fun <T, C, R> splitImpl(
                 columnCollectors[j].add(clause.default)
         }
 
-        var names = columnNamesGenerator(column, columnCollectors.size)
-        if (names.size < columnCollectors.size) {
-            names = names + (1..(columnCollectors.size - names.size)).map { "split$it" }
-        }
+        val names = columnNamesGenerator(column, columnCollectors.size)
+        val sourcePath = node.pathFromRoot()
+
+        columnCollectors.forEachIndexed { i, col ->
+            val preferredName = names.getOrNull(i)
 
-        columnCollectors.mapIndexed { i, col ->
+            val pathToInsert = if (clause.inward) sourcePath else sourcePath.dropLast(1)
+            val name = generateUnusedName(removeResult.df, preferredName, pathToInsert, columnsToInsert)
+
+            val path = pathToInsert + name
 
-            val name = nameGenerator.addUnique(names[i])
-            val sourcePath = node.pathFromRoot()
-            val path = if (clause.inward) sourcePath + name else sourcePath.dropLast(1) + name
             val data = col.toColumn(name)
-            ColumnToInsert(path, data, node)
+            columnsToInsert.add(ColumnToInsert(path, data, node))
+        }
+    }
+
+    return removeResult.df.insertImpl(columnsToInsert)
+}
+
+internal fun generateUnusedName(
+    df: DataFrame<*>,
+    preferredName: String?,
+    insertPath: ColumnPath,
+    columnsToBeInserted: List<ColumnToInsert>
+): String {
+    // check if column with this name already exists in the df in the same position in the hierarchy,
+    // or we already have a column with this name in the list of columns to be inserted to the same position in the hierarchy
+    fun isUsed(name: String) =
+        df.getColumnOrNull(insertPath + name) != null || columnsToBeInserted.any { it.insertionPath == insertPath + name }
+
+    fun generateNameVariationByTryingNumericSuffixes(original: String? = null, startSuffix: Int): String {
+        var k = startSuffix
+        var name = original ?: "split$k"
+        while (isUsed(name)) {
+            name = original ?: "split"
+            name = "${name}${k++}"
         }
+
+        return name
     }
 
-    return removeResult.df.insertImpl(toInsert)
+    return if (preferredName == null) {
+        generateNameVariationByTryingNumericSuffixes(startSuffix = 1)
+    } else {
+        generateNameVariationByTryingNumericSuffixes(preferredName, 1)
+    }
 }
 
 internal fun String.splitDefault() = split(",").map { it.trim() }
diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/split.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/split.kt
@@ -4,12 +4,16 @@ import io.kotest.matchers.shouldBe
 import org.jetbrains.kotlinx.dataframe.AnyFrame
 import org.jetbrains.kotlinx.dataframe.DataFrame
 import org.jetbrains.kotlinx.dataframe.hasNulls
+import org.jetbrains.kotlinx.dataframe.impl.DataRowImpl
 import org.jetbrains.kotlinx.dataframe.type
 import org.junit.Test
 import kotlin.reflect.typeOf
 
 class SplitTests {
 
+    val stringPairDf = dataFrameOf("first", "second")("22-65", "22-66")
+    val listPairDf = dataFrameOf("first", "second")(listOf("22", "65"), listOf("22", "66"))
+
     @Test
     fun `split with default`() {
         val recentDelays = listOf(listOf(23, 47), listOf(), listOf(24, 43, 87), listOf(13), listOf(67, 32)).toColumn("RecentDelays")
@@ -60,4 +64,105 @@ class SplitTests {
             3, emptyList<Int>(), emptyList<Int>()
         )
     }
+
+    @Test
+    fun `split string by delimiter inward`() {
+        val res = stringPairDf.split("first", "second").by("-").inward("left", "right")
+
+        res shouldBe dataFrameOf(
+            columnOf(columnOf("22") named "left", columnOf("65") named "right") named "first",
+            columnOf(columnOf("22") named "left", columnOf("66") named "right") named "second"
+        )
+    }
+
+    @Test
+    fun `split string by delimiter into columns with suffixes`() {
+        val res = stringPairDf.split("first", "second").by("-").into("left", "right")
+
+        res shouldBe dataFrameOf(
+            columnOf("22") named "left",
+            columnOf("65") named "right",
+            columnOf("22") named "left1",
+            columnOf("66") named "right1"
+        )
+    }
+
+    @Test
+    fun `split list inward with autogenerated names`() {
+        val res = listPairDf.split { "first"<List<String>>() and "second"<List<String>>() }.inward()
+
+        res shouldBe dataFrameOf(
+            columnOf(columnOf("22") named "split1", columnOf("65") named "split2") named "first",
+            columnOf(columnOf("22") named "split1", columnOf("66") named "split2") named "second"
+        )
+    }
+
+    @Test
+    fun `split list into with autogenerated names`() {
+        val res = listPairDf.split { "first"<List<String>>() and "second"<List<String>>() }.into()
+
+        res shouldBe dataFrameOf(
+            columnOf("22") named "split1",
+            columnOf("65") named "split2",
+            columnOf("22") named "split3",
+            columnOf("66") named "split4"
+        )
+    }
+
+    @Test
+    fun `sequence of splits with autogenerated names`() {
+        var res = listPairDf.split { "first"<List<String>>() }.into()
+        res = res.split { "second"<List<String>>() }.into()
+
+        res shouldBe dataFrameOf(
+            columnOf("22") named "split1",
+            columnOf("65") named "split2",
+            columnOf("22") named "split3",
+            columnOf("66") named "split4"
+        )
+    }
+
+    @Test
+    fun `split column group inward`() {
+        val df = stringPairDf.group("first", "second").into("group")
+
+        // Note: this operation replaces original columns in group so there is no name conflict
+        val res = df.split { "group"<DataRowImpl<*>>() }
+            .by { it -> listOf(it[1], it[0]) } // swap columns
+            .inward("first", "second") // no name conflict
+
+        res shouldBe dataFrameOf(
+            columnOf(columnOf("22-66") named "first", columnOf("22-65") named "second") named "group"
+        )
+    }
+
+    @Test
+    fun `split column group into hierarchy with correct names`() {
+        val df = dataFrameOf(
+            columnOf(
+                columnOf("a") named "first",
+                columnOf(
+                    columnOf("b") named "first",
+                    columnOf("c") named "second"
+                ) named "nestedGroup"
+            ) named "topLevelGroup",
+            columnOf("d") named "first",
+        )
+
+        val topLevelGroup by columnGroup()
+        val nestedGroup by topLevelGroup.columnGroup()
+
+        val res = df.split { nestedGroup }
+            .by { it -> listOf(it[0], it[1]) }
+            .into("first", "second") // name conflict
+
+        res shouldBe dataFrameOf(
+            columnOf(
+                columnOf("a") named "first",
+                columnOf("b") named "first1",
+                columnOf("c") named "second"
+            ) named "topLevelGroup",
+            columnOf("d") named "first",
+        )
+    }
 }