Skip to content

All except simplification: Option 1 #1036

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions core/api/core.api
Original file line number Diff line number Diff line change
Expand Up @@ -9410,6 +9410,9 @@ public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/Ds
public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$COLUMN_SET_PART {
}

public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$ColumNameDef {
}

public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$ColumnDef {
}

Expand Down Expand Up @@ -9437,10 +9440,13 @@ public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/Ds
public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$ColumnKindRef {
}

public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$ColumnNoAccessorDef {
public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$ColumnNameRef {
}

public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$ColumnNoPathDef {
}

public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$ColumnNoAccessorRef {
public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$ColumnNoPathRef {
}

public abstract interface class org/jetbrains/kotlinx/dataframe/documentation/DslGrammarTemplateColumnsSelectionDsl$DslGrammarTemplate$ColumnOrColumnSetDef {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,9 @@ public interface ColumnsSelectionDsl<out T> : // SingleColumn<DataRow<T>>
*
* {@include [DslGrammarTemplate.ColumnGroupDef]}
*
* {@include [DslGrammarTemplate.ColumnNoAccessorDef]}
* {@include [DslGrammarTemplate.ColumNameDef]}
*
* {@include [DslGrammarTemplate.ColumnNoPathDef]}
*
* {@include [DslGrammarTemplate.ColumnOrColumnSetDef]}
*
Expand Down Expand Up @@ -180,7 +182,7 @@ public interface ColumnsSelectionDsl<out T> : // SingleColumn<DataRow<T>>
*
* `| `{@include [AllExceptColumnsSelectionDsl.Grammar.PlainDslName]}**` { `**{@include [DslGrammarTemplate.ColumnsSelectorRef]}**` \}`**
*
* `| `{@include [AllExceptColumnsSelectionDsl.Grammar.PlainDslName]}**`(`**{@include [DslGrammarTemplate.ColumnRef]}**`,`**` ..`**`)`**
* `| `{@include [AllExceptColumnsSelectionDsl.Grammar.PlainDslName]}**`(`**{@include [DslGrammarTemplate.ColumnNoPathRef]}**`,`**` ..`**`)`**
*
* `| `{@include [DslGrammarTemplate.ColumnOrColumnSetRef]}` `{@include [AndColumnsSelectionDsl.Grammar.InfixName]}` [ `**`{`**` ] `{@include [DslGrammarTemplate.ColumnOrColumnSetRef]}` [ `**`\}`**` ] `
*
Expand Down Expand Up @@ -307,7 +309,7 @@ public interface ColumnsSelectionDsl<out T> : // SingleColumn<DataRow<T>>
*
* {@include [Indent]}`| `{@include [AllExceptColumnsSelectionDsl.Grammar.ColumnGroupName]}**` { `**{@include [DslGrammarTemplate.ColumnsSelectorRef]}**` \} `**
*
* {@include [Indent]}`| `{@include [AllExceptColumnsSelectionDsl.Grammar.ColumnGroupName]}**`(`**{@include [DslGrammarTemplate.ColumnNoAccessorRef]}**`,`**` ..`**`)`**
* {@include [Indent]}`| `{@include [AllExceptColumnsSelectionDsl.Grammar.ColumnGroupName]}**`(`**{@include [DslGrammarTemplate.ColumnNameRef]}**`,`**` ..`**`)`**
*
* {@include [Indent]}`| `{@include [AndColumnsSelectionDsl.Grammar.Name]}**` (`**`|`**`{ `**{@include [DslGrammarTemplate.ColumnOrColumnSetRef]}**` \}`**`|`**`)`**
*
Expand Down
133 changes: 53 additions & 80 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/allExcept.kt

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ public fun <T> Pivot<T>.groupBy(vararg columns: KProperty<*>): PivotGroupBy<T> =
public fun <T> Pivot<T>.groupByOther(): PivotGroupBy<T> {
val impl = this as PivotImpl<T>
val pivotColumns = df.getPivotColumnPaths(columns).toColumnSet()
return impl.toGroupedPivot(moveToTop = false) { allExcept(pivotColumns) }
return impl.toGroupedPivot(moveToTop = false) {
(this as DataFrame<T>).remove { pivotColumns }.columns().toColumnSet()
}
}

// endregion
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import org.jetbrains.kotlinx.dataframe.api.asSingleColumn
* ## ColumnSet
*
* Entity that can be resolved into a list of [columns][DataColumn].
* Unlike an actual "set", repeated columns are allowed.
* Just like [SingleColumn], this is a [ColumnsResolver].
*
* @see [SingleColumn]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,11 @@ public interface DslGrammarTemplateColumnsSelectionDsl {
*/
public interface ColumnGroupDef

/** `columnNoAccessor: `[`String`][String]` | `[`KProperty`][kotlin.reflect.KProperty]`<*> | `[`ColumnPath`][org.jetbrains.kotlinx.dataframe.columns.ColumnPath] */
public interface ColumnNoAccessorDef
/** `columnName: `[`String`][String]` | `[`KProperty`][kotlin.reflect.KProperty]`<*>` */
public interface ColumNameDef

/** `columnNoPath: `[`ColumnAccessor`][org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor]` | `[`String`][String]` | `[`KProperty`][kotlin.reflect.KProperty]`<*>` */
public interface ColumnNoPathDef

/** `columnOrSet: `{@include [ColumnRef]}` | `{@include [ColumnSetRef]} */
public interface ColumnOrColumnSetDef
Expand Down Expand Up @@ -190,8 +193,11 @@ public interface DslGrammarTemplateColumnsSelectionDsl {
/** [`columnGroup`][ColumnGroupDef] */
public interface ColumnGroupRef

/** [`columnNoAccessor`][ColumnNoAccessorDef] */
public interface ColumnNoAccessorRef
/** [`columnName`][ColumNameDef] */
public interface ColumnNameRef

/** [`columnNoPath`][ColumnNoPathDef] */
public interface ColumnNoPathRef

/** [`columnOrSet`][ColumnOrColumnSetDef] */
public interface ColumnOrColumnSetRef
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.jetbrains.kotlinx.dataframe.impl.aggregation

import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.ColumnsSelector
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.aggregation.AggregateBody
Expand All @@ -9,6 +10,7 @@ import org.jetbrains.kotlinx.dataframe.api.PivotGroupBy
import org.jetbrains.kotlinx.dataframe.api.aggregate
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.firstOrNull
import org.jetbrains.kotlinx.dataframe.api.remove
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.impl.GroupByImpl
import org.jetbrains.kotlinx.dataframe.impl.api.aggregatePivot
Expand All @@ -33,10 +35,11 @@ internal data class PivotGroupByImpl<T>(
df.groups.firstOrNull()
?.getPivotColumnPaths(columns).orEmpty()
.let { pivotPaths ->
{
all().except(
pivotPaths.toColumnSet() and (df as GroupByImpl).keyColumnsInGroups.toColumnSet(),
)
return@let {
(this as AnyFrame)
.remove { pivotPaths.toColumnSet() and (df as GroupByImpl).keyColumnsInGroups.toColumnSet() }
.columns()
.toColumnSet()
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,20 @@ import org.jetbrains.kotlinx.dataframe.api.concat
import org.jetbrains.kotlinx.dataframe.api.dropNA
import org.jetbrains.kotlinx.dataframe.api.groupBy
import org.jetbrains.kotlinx.dataframe.api.map
import org.jetbrains.kotlinx.dataframe.api.remove
import org.jetbrains.kotlinx.dataframe.api.replace
import org.jetbrains.kotlinx.dataframe.api.with
import org.jetbrains.kotlinx.dataframe.columns.ColumnKind
import org.jetbrains.kotlinx.dataframe.columns.toColumnSet
import org.jetbrains.kotlinx.dataframe.impl.columns.asAnyFrameColumn
import org.jetbrains.kotlinx.dataframe.impl.columns.extractDataFrame
import org.jetbrains.kotlinx.dataframe.impl.getListType
import kotlin.reflect.typeOf

internal fun <T, C> DataFrame<T>.implodeImpl(dropNA: Boolean = false, columns: ColumnsSelector<T, C>): DataFrame<T> =
groupBy { allExcept(columns) }.updateGroups {
groupBy {
(this as DataFrame<T>).remove(columns).columns().toColumnSet()
}.updateGroups {
replace(columns).with { column ->
val (value, type) = when (column.kind()) {
ColumnKind.Value -> (if (dropNA) column.dropNA() else column).toList() to getListType(column.type())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ internal class ColumnAccessorImpl<T>(val path: ColumnPath) : ColumnAccessor<T> {
val col = df.getColumn<Any?>(colName, context.unresolvedColumnsPolicy) ?: return null
if (!col.isColumnGroup()) {
error(
"Cannot resolve column '${path.subList(0, i + 2).joinToString(".")}': " +
"Column '${path.subList(0, i + 1).joinToString(".")}' is not a column group.",
"Cannot resolve column '${path.subList(0, i + 2).joinToString("/")}': " +
"Column '${path.subList(0, i + 1).joinToString("/")}' is not a column group.",
)
} else {
col
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,9 @@ import org.jetbrains.kotlinx.dataframe.AnyRow
import org.jetbrains.kotlinx.dataframe.ColumnsContainer
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.asColumnGroup
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.name
import org.jetbrains.kotlinx.dataframe.api.pathOf
import org.jetbrains.kotlinx.dataframe.api.remove
import org.jetbrains.kotlinx.dataframe.api.replace
import org.jetbrains.kotlinx.dataframe.api.with
import org.jetbrains.kotlinx.dataframe.columns.BaseColumn
import org.jetbrains.kotlinx.dataframe.columns.ColumnAccessor
import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
Expand All @@ -30,12 +26,11 @@ import org.jetbrains.kotlinx.dataframe.columns.UnresolvedColumnsPolicy
import org.jetbrains.kotlinx.dataframe.columns.ValueColumn
import org.jetbrains.kotlinx.dataframe.columns.values
import org.jetbrains.kotlinx.dataframe.impl.DataFrameImpl
import org.jetbrains.kotlinx.dataframe.impl.asNullable
import org.jetbrains.kotlinx.dataframe.impl.columns.missing.MissingDataColumn
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.ColumnPosition
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.TreeNode
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.collectTree
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.getOrPut
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.contains
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.put
import org.jetbrains.kotlinx.dataframe.impl.columns.tree.topmostChildren
import org.jetbrains.kotlinx.dataframe.impl.equalsByElement
Expand Down Expand Up @@ -386,89 +381,52 @@ internal fun <T> List<ColumnWithPath<T>>.simplify(): List<ColumnWithPath<T>> {
return root.topmostChildren { it.data != null }.map { it.data!! }
}

/**
* Returns a new list of column paths, except the ones inside [columns].
* NOTE: The structure is not kept the same; if a column is removed, its parent will be removed as well, and
* all its siblings will be lifted out of the group. This also happens if a column is "removed" that does
* not exist in [this].
*/
internal fun List<ColumnWithPath<*>>.allColumnsExceptAndUnpack(
columns: Iterable<ColumnWithPath<*>>,
): List<ColumnWithPath<*>> {
if (isEmpty()) return emptyList()
val fullTree = collectTree()
columns.forEach {
var node = fullTree.getOrPut(it.path).asNullable()
node?.allChildren()?.forEach { it.data = null }
while (node != null) {
node.data = null
node = node.parent
}

/**
 * Builds a human-readable name for this column, used when composing messages.
 *
 * A column that is not a column group is rendered as its joined [path][ColumnWithPath.path].
 * A column group is rendered as its joined path followed by `/{...}`, where the braces contain the
 * rendered names of its direct child columns (each rendered recursively with this same function).
 */
private fun ColumnWithPath<*>.renderName(): String {
    val ownPath = path.joinToString()
    return if (isColumnGroup()) {
        // Children get a path attached first so they can be rendered by the same function.
        val renderedChildren = columns()
            .map { it.addPath() }
            .joinToString { it.renderName() }
        "$ownPath/{$renderedChildren}"
    } else {
        ownPath
    }
}
val subtrees = fullTree.topmostChildren { it.data != null }
return subtrees.map { it.data!!.addPath(it.pathFromRoot()) }
}

/**
* Returns a new list of column paths, except the ones inside [columns].
* NOTE: ColumnGroups are adapted to keep their structure. If a column inside a column group is excepted, it will
* be removed from the group.
* Empty groups will be removed if [removeEmptyGroups]` == true`
* Returns a new list of distinct column paths, except the ones inside [columns].
* NOTE: There are no structural changes as removing nested columns is not allowed.
*
* @throws IllegalArgumentException if a nested column were to be removed
*/
internal fun List<ColumnWithPath<*>>.allColumnsExceptKeepingStructure(
columns: Iterable<ColumnWithPath<*>>,
removeEmptyGroups: Boolean = true,
): List<ColumnWithPath<*>> {
internal fun List<ColumnWithPath<*>>.removeAll(columnsToRemove: Iterable<ColumnWithPath<*>>): List<ColumnWithPath<*>> {
if (isEmpty()) return emptyList()
val fullTree = collectTree()
for (columnToExcept in columns) {
// grab the node representing the column from the tree
val nodeToExcept = fullTree.getOrPut(columnToExcept.path).asNullable()
if (nodeToExcept != null) {
// remove the children from the node (if it's a column group) and remove its data (the column itself)
nodeToExcept.allChildren().forEach { it.data = null }
nodeToExcept.data = null

// we need to update the data of the parent node(s) to reflect the removal of the column
if (nodeToExcept.parent != null) {
// we grab the data of the parent node, which should be a column group
// treat it as a DF to remove the column to except from it and
// convert it back to a column group
val current = nodeToExcept.parent.data as ColumnGroup<*>? ?: continue
val adjustedCurrent = current
.remove(nodeToExcept.name)
.asColumnGroup(current.name)
.addPath(current.path())

// remove the group if it's empty and removeEmptyGroups is true
// else, simply update the parent's data with the adjusted column group
nodeToExcept.parent.data =
if (adjustedCurrent.cols().isEmpty() && removeEmptyGroups) {
null
} else {
adjustedCurrent
}

// now we update the parent's parents recursively with new column group instances
var parent = nodeToExcept.parent.parent

@Suppress("UNNECESSARY_NOT_NULL_ASSERTION")
var currentNode = nodeToExcept.parent!!
while (parent != null) {
val parentData = parent.data as ColumnGroup<*>? ?: break
parent.data = parentData
.replace(currentNode.name).with { currentNode.data!! }
.asColumnGroup(parentData.name)
.addPath(parentData.path())

currentNode = parent
parent = parent.parent
}
}

// subtract columnsToRemove from this
val result = this.toMutableSet()
val columnPathsToRemove = columnsToRemove
.map { it.path }
.toSet()
.mapNotNull { toRemove ->
val removed = result.removeIf { it.path == toRemove }
if (removed) null else toRemove
}

// provide a helpful exception when a user tries to remove a nested column
if (columnPathsToRemove.isNotEmpty()) {
val fullTree = this.collectTree()
val nestedColumns = columnPathsToRemove.filter { it in fullTree }

if (nestedColumns.isNotEmpty()) {
throw IllegalArgumentException(
"Cannot exclude the nested columns '[${
nestedColumns.joinToString { it.joinToString() }
}]' from the column set '[${
this.joinToString { it.renderName() }
}]' with the except-functions. Use the `DataFrame<*>.remove { }` operation instead.",
)
}
}
val subtrees = fullTree.topmostChildren { it.data != null }
return subtrees.map { it.data!!.addPath(it.pathFromRoot()) }

return result.toList()
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,17 @@ internal fun <T> TreeNode<T>.getOrPut(path: ColumnPath, createData: (ColumnPath)
return node
}

/**
 * Tells whether [path] can be followed, part by part, starting at this tree node.
 *
 * Each part of [path] is looked up as a child of the node reached so far.
 * An empty [path] is always contained, since no lookup is needed.
 *
 * @param path The column path parts to follow, starting from this node.
 * @return `true` if every part of [path] resolves to a node, `false` as soon as one part is missing.
 */
internal operator fun <T> TreeNode<T>.contains(path: ColumnPath): Boolean {
    var cursor: TreeNode<T> = this
    for (segment in path) {
        // Descend one level; a missing child means the path does not exist from here.
        cursor = cursor.get(segment) ?: return false
    }
    return true
}

/**
* Traverses all children in the tree in depth-first order and returns the top-most nodes that satisfy
* [yieldCondition]. This means that if a node satisfies [yieldCondition], its children are not traversed, regardless of
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,12 @@ internal const val ALL_COLS_EXCEPT =
internal const val ALL_COLS_REPLACE = "allColsExcept { other }"
internal const val ALL_COLS_REPLACE_VARARG = "allColsExcept { others.toColumnSet() }"

/**
 * Message stating that an `allColsExcept` overload is blocked for columns nested in the column group,
 * and pointing to a String reference or `DataFrame.remove {}` as alternatives.
 * NOTE(review): presumably attached to the `ColumnPath` overloads — confirm at the use site.
 */
internal const val ALL_COLS_EXCEPT_COLUMN_PATH =
    "This overload is blocked because you cannot use `allColsExcept` " +
        "for columns nested in this column group. Use a String to refer to a column instead, " +
        "or use DataFrame.remove {} to remove nested columns."

/**
 * Message stating that an `allExcept` overload is blocked for nested columns,
 * and pointing to a String reference or `DataFrame.remove {}` as alternatives.
 * NOTE(review): presumably attached to the `ColumnPath` overloads — confirm at the use site.
 */
internal const val ALL_EXCEPT_COLUMN_PATH =
    "This overload is blocked because you cannot use `allExcept` " +
        "for nested columns. Use a String to refer to a column instead, " +
        "or use DataFrame.remove {} to remove nested columns."

// endregion
Loading
Loading