Skip to content

Commit a014fd5

Browse files
authored
Merge pull request #240 from ermolenkodev/224
Fix new columns names generation during split operation (#224)
2 parents 4ff8406 + c1d4cb6 commit a014fd5

File tree

2 files changed

+152
-13
lines changed
  • core/src
    • main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api
    • test/kotlin/org/jetbrains/kotlinx/dataframe/api

2 files changed

+152
-13
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/split.kt

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ import org.jetbrains.kotlinx.dataframe.AnyFrame
44
import org.jetbrains.kotlinx.dataframe.DataFrame
55
import org.jetbrains.kotlinx.dataframe.api.SplitWithTransform
66
import org.jetbrains.kotlinx.dataframe.api.rows
7+
import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
78
import org.jetbrains.kotlinx.dataframe.columns.ColumnWithPath
89
import org.jetbrains.kotlinx.dataframe.impl.ColumnDataCollector
910
import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnWithPath
1011
import org.jetbrains.kotlinx.dataframe.impl.createDataCollector
11-
import org.jetbrains.kotlinx.dataframe.impl.nameGenerator
1212
import org.jetbrains.kotlinx.dataframe.nrow
1313

1414
internal fun valueToList(value: Any?, splitStrings: Boolean = true): List<Any?> = when (value) {
@@ -25,9 +25,13 @@ internal fun <T, C, R> splitImpl(
2525
val nrow = clause.df.nrow
2626

2727
val removeResult = clause.df.removeImpl(columns = clause.columns)
28-
val nameGenerator = removeResult.df.nameGenerator()
2928

30-
val toInsert = removeResult.removedColumns.flatMap { node ->
29+
// As we insert multiple columns at once it's possible to encounter a name conflict within a batch of a new columns
30+
// that's why an old school mutable list is used here to check for name conflicts in intermediate steps of generation
31+
// of columns to insert
32+
val columnsToInsert = mutableListOf<ColumnToInsert>()
33+
34+
removeResult.removedColumns.forEach { node ->
3135

3236
val column = node.toColumnWithPath<C>()
3337
val columnCollectors = mutableListOf<ColumnDataCollector>()
@@ -46,22 +50,52 @@ internal fun <T, C, R> splitImpl(
4650
columnCollectors[j].add(clause.default)
4751
}
4852

49-
var names = columnNamesGenerator(column, columnCollectors.size)
50-
if (names.size < columnCollectors.size) {
51-
names = names + (1..(columnCollectors.size - names.size)).map { "split$it" }
52-
}
53+
val names = columnNamesGenerator(column, columnCollectors.size)
54+
val sourcePath = node.pathFromRoot()
55+
56+
columnCollectors.forEachIndexed { i, col ->
57+
val preferredName = names.getOrNull(i)
5358

54-
columnCollectors.mapIndexed { i, col ->
59+
val pathToInsert = if (clause.inward) sourcePath else sourcePath.dropLast(1)
60+
val name = generateUnusedName(removeResult.df, preferredName, pathToInsert, columnsToInsert)
61+
62+
val path = pathToInsert + name
5563

56-
val name = nameGenerator.addUnique(names[i])
57-
val sourcePath = node.pathFromRoot()
58-
val path = if (clause.inward) sourcePath + name else sourcePath.dropLast(1) + name
5964
val data = col.toColumn(name)
60-
ColumnToInsert(path, data, node)
65+
columnsToInsert.add(ColumnToInsert(path, data, node))
66+
}
67+
}
68+
69+
return removeResult.df.insertImpl(columnsToInsert)
70+
}
71+
72+
internal fun generateUnusedName(
73+
df: DataFrame<*>,
74+
preferredName: String?,
75+
insertPath: ColumnPath,
76+
columnsToBeInserted: List<ColumnToInsert>
77+
): String {
78+
// check if column with this name already exists in the df in the same position in the hierarchy,
79+
// or we already have a column with this name in the list of columns to be inserted to the same position in the hierarchy
80+
fun isUsed(name: String) =
81+
df.getColumnOrNull(insertPath + name) != null || columnsToBeInserted.any { it.insertionPath == insertPath + name }
82+
83+
fun generateNameVariationByTryingNumericSuffixes(original: String? = null, startSuffix: Int): String {
84+
var k = startSuffix
85+
var name = original ?: "split$k"
86+
while (isUsed(name)) {
87+
name = original ?: "split"
88+
name = "${name}${k++}"
6189
}
90+
91+
return name
6292
}
6393

64-
return removeResult.df.insertImpl(toInsert)
94+
return if (preferredName == null) {
95+
generateNameVariationByTryingNumericSuffixes(startSuffix = 1)
96+
} else {
97+
generateNameVariationByTryingNumericSuffixes(preferredName, 1)
98+
}
6599
}
66100

67101
internal fun String.splitDefault() = split(",").map { it.trim() }

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/split.kt

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,16 @@ import io.kotest.matchers.shouldBe
44
import org.jetbrains.kotlinx.dataframe.AnyFrame
55
import org.jetbrains.kotlinx.dataframe.DataFrame
66
import org.jetbrains.kotlinx.dataframe.hasNulls
7+
import org.jetbrains.kotlinx.dataframe.impl.DataRowImpl
78
import org.jetbrains.kotlinx.dataframe.type
89
import org.junit.Test
910
import kotlin.reflect.typeOf
1011

1112
class SplitTests {
1213

14+
val stringPairDf = dataFrameOf("first", "second")("22-65", "22-66")
15+
val listPairDf = dataFrameOf("first", "second")(listOf("22", "65"), listOf("22", "66"))
16+
1317
@Test
1418
fun `split with default`() {
1519
val recentDelays = listOf(listOf(23, 47), listOf(), listOf(24, 43, 87), listOf(13), listOf(67, 32)).toColumn("RecentDelays")
@@ -60,4 +64,105 @@ class SplitTests {
6064
3, emptyList<Int>(), emptyList<Int>()
6165
)
6266
}
67+
68+
@Test
69+
fun `split string by delimiter inward`() {
70+
val res = stringPairDf.split("first", "second").by("-").inward("left", "right")
71+
72+
res shouldBe dataFrameOf(
73+
columnOf(columnOf("22") named "left", columnOf("65") named "right") named "first",
74+
columnOf(columnOf("22") named "left", columnOf("66") named "right") named "second"
75+
)
76+
}
77+
78+
@Test
79+
fun `split string by delimiter into columns with suffixes`() {
80+
val res = stringPairDf.split("first", "second").by("-").into("left", "right")
81+
82+
res shouldBe dataFrameOf(
83+
columnOf("22") named "left",
84+
columnOf("65") named "right",
85+
columnOf("22") named "left1",
86+
columnOf("66") named "right1"
87+
)
88+
}
89+
90+
@Test
91+
fun `split list inward with autogenerated names`() {
92+
val res = listPairDf.split { "first"<List<String>>() and "second"<List<String>>() }.inward()
93+
94+
res shouldBe dataFrameOf(
95+
columnOf(columnOf("22") named "split1", columnOf("65") named "split2") named "first",
96+
columnOf(columnOf("22") named "split1", columnOf("66") named "split2") named "second"
97+
)
98+
}
99+
100+
@Test
101+
fun `split list into with autogenerated names`() {
102+
val res = listPairDf.split { "first"<List<String>>() and "second"<List<String>>() }.into()
103+
104+
res shouldBe dataFrameOf(
105+
columnOf("22") named "split1",
106+
columnOf("65") named "split2",
107+
columnOf("22") named "split3",
108+
columnOf("66") named "split4"
109+
)
110+
}
111+
112+
@Test
113+
fun `sequence of splits with autogenerated names`() {
114+
var res = listPairDf.split { "first"<List<String>>() }.into()
115+
res = res.split { "second"<List<String>>() }.into()
116+
117+
res shouldBe dataFrameOf(
118+
columnOf("22") named "split1",
119+
columnOf("65") named "split2",
120+
columnOf("22") named "split3",
121+
columnOf("66") named "split4"
122+
)
123+
}
124+
125+
@Test
126+
fun `split column group inward`() {
127+
val df = stringPairDf.group("first", "second").into("group")
128+
129+
// Note: this operation replaces original columns in group so there is no name conflict
130+
val res = df.split { "group"<DataRowImpl<*>>() }
131+
.by { it -> listOf(it[1], it[0]) } // swap columns
132+
.inward("first", "second") // no name conflict
133+
134+
res shouldBe dataFrameOf(
135+
columnOf(columnOf("22-66") named "first", columnOf("22-65") named "second") named "group"
136+
)
137+
}
138+
139+
@Test
140+
fun `split column group into hierarchy with correct names`() {
141+
val df = dataFrameOf(
142+
columnOf(
143+
columnOf("a") named "first",
144+
columnOf(
145+
columnOf("b") named "first",
146+
columnOf("c") named "second"
147+
) named "nestedGroup"
148+
) named "topLevelGroup",
149+
columnOf("d") named "first",
150+
)
151+
152+
val topLevelGroup by columnGroup()
153+
val nestedGroup by topLevelGroup.columnGroup()
154+
155+
val res = df.split { nestedGroup }
156+
.by { it -> listOf(it[0], it[1]) }
157+
.into("first", "second") // name conflict
158+
159+
res shouldBe dataFrameOf(
160+
columnOf(
161+
columnOf("a") named "first",
162+
columnOf("b") named "first1",
163+
columnOf("c") named "second"
164+
) named "topLevelGroup",
165+
columnOf("d") named "first",
166+
)
167+
}
63168
}

0 commit comments

Comments
 (0)