Skip to content

statistics documentation update #1165

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 1, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import java.math.BigDecimal
import java.math.BigInteger
import kotlin.reflect.KClass
import kotlin.reflect.KType
import kotlin.reflect.full.isSubtypeOf
import kotlin.reflect.full.withNullability
import kotlin.reflect.typeOf

Expand Down Expand Up @@ -224,10 +225,24 @@ internal fun Sequence<Number?>.convertToUnifiedNumberType(
"Cannot find unified number type of types: ${types.joinToString { renderType(it) }}",
)
}
val converter = createConverter(typeOf<Number>(), commonNumberType)!! as (Number) -> Number?
return map {
if (it == null) return@map null
converter(it) ?: error("Can not convert $it to $commonNumberType")
require(commonNumberType.isSubtypeOf(typeOf<Number?>())) {
"Cannot convert numbers to $commonNumberType; it is not a subtype of Number?"
}
return when (commonNumberType) {
nothingType -> {
require(null !in this) { "Cannot unify numbers to Nothing; it contains nulls" }
this
}

nullableNothingType -> this

else -> {
val converter = createConverter(typeOf<Number>(), commonNumberType)!! as (Number) -> Number?
this.map {
if (it == null) return@map null
converter(it) ?: error("Can not convert $it to $commonNumberType")
}
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import org.jetbrains.kotlinx.dataframe.api.mean
import org.jetbrains.kotlinx.dataframe.api.meanFor
import org.jetbrains.kotlinx.dataframe.api.meanOf
import org.jetbrains.kotlinx.dataframe.api.median
import org.jetbrains.kotlinx.dataframe.api.medianBy
import org.jetbrains.kotlinx.dataframe.api.medianFor
import org.jetbrains.kotlinx.dataframe.api.medianOf
import org.jetbrains.kotlinx.dataframe.api.min
Expand All @@ -43,6 +44,7 @@ import org.jetbrains.kotlinx.dataframe.api.minFor
import org.jetbrains.kotlinx.dataframe.api.minOf
import org.jetbrains.kotlinx.dataframe.api.minOrNull
import org.jetbrains.kotlinx.dataframe.api.percentile
import org.jetbrains.kotlinx.dataframe.api.percentileBy
import org.jetbrains.kotlinx.dataframe.api.percentileFor
import org.jetbrains.kotlinx.dataframe.api.percentileOf
import org.jetbrains.kotlinx.dataframe.api.pivot
Expand Down Expand Up @@ -179,7 +181,7 @@ class Analyze : TestBase() {
// SampleStart
df.sum() // sum of values per every numeric column
df.sum { age and weight } // sum of all values in `age` and `weight`
df.sumFor { age and weight } // sum of values per `age` and `weight` separately
df.sumFor(skipNaN = true) { age and weight } // sum of values per `age` and `weight` separately
df.sumOf { (weight ?: 0) / age } // sum of expression evaluated for every row
// SampleEnd
}
Expand All @@ -190,7 +192,7 @@ class Analyze : TestBase() {
// SampleStart
df.min() // min of values per every comparable column
df.min { age and weight } // min of all values in `age` and `weight`
df.minFor { age and weight } // min of values per `age` and `weight` separately
df.minFor(skipNaN = true) { age and weight } // min of values per `age` and `weight` separately
df.minOf { (weight ?: 0) / age } // min of expression evaluated for every row
df.minBy { age } // DataRow with minimal `age`
// SampleEnd
Expand All @@ -214,8 +216,9 @@ class Analyze : TestBase() {
// SampleStart
df.median() // median of values per every comparable column
df.median { age and weight } // median of all values in `age` and `weight`
df.medianFor { age and weight } // median of values per `age` and `weight` separately
df.medianFor(skipNaN = true) { age and weight } // median of values per `age` and `weight` separately
df.medianOf { (weight ?: 0) / age } // median of expression evaluated for every row
df.medianBy { age } // DataRow where the median age lies (lower-median for an even number of values)
// SampleEnd
}

Expand All @@ -235,10 +238,11 @@ class Analyze : TestBase() {
@TransformDataFrameExpressions
fun percentileModes() {
// SampleStart
df.percentile(25.0) // percentile of values per every comparable column
df.percentile(25.0) { age and weight } // percentile of all values in `age` and `weight`
df.percentileFor(25.0) { age and weight } // percentile of values per `age` and `weight` separately
df.percentileOf(25.0) { (weight ?: 0) / age } // percentile of expression evaluated for every row
df.percentile(25.0) // 25th percentile of values per every comparable column
df.percentile(75.0) { age and weight } // 75th percentile of all values in `age` and `weight`
df.percentileFor(50.0, skipNaN = true) { age and weight } // 50th percentile of values per `age` and `weight` separately
df.percentileOf(75.0) { (weight ?: 0) / age } // 75th percentile of expression evaluated for every row
df.percentileBy(25.0) { age } // DataRow where the 25th percentile of `age` lies (index rounded using R3)
// SampleEnd
}

Expand All @@ -247,9 +251,9 @@ class Analyze : TestBase() {
fun percentileAggregations() {
// SampleStart
df.percentile(25.0)
df.age.percentile(25.0)
df.groupBy { city }.percentile(25.0)
df.pivot { city }.percentile(25.0)
df.age.percentile(75.0)
df.groupBy { city }.percentile(50.0)
df.pivot { city }.percentile(75.0)
df.pivot { city }.groupBy { name.lastName }.percentile(25.0)
// SampleEnd
}
Expand All @@ -259,8 +263,8 @@ class Analyze : TestBase() {
fun meanModes() {
// SampleStart
df.mean() // mean of values per every numeric column
df.mean(skipNaN = true) { age and weight } // mean of all values in `age` and `weight`, skips NA
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NA
df.mean { age and weight } // mean of all values in `age` and `weight`
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NaN
df.meanOf { (weight ?: 0) / age } // mean of expression evaluated for every row
// SampleEnd
}
Expand All @@ -283,7 +287,7 @@ class Analyze : TestBase() {
// SampleStart
df.std() // std of values per every numeric column
df.std { age and weight } // std of all values in `age` and `weight`
df.stdFor { age and weight } // std of values per `age` and `weight` separately, skips NA
df.stdFor(skipNaN = true) { age and weight } // std of values per `age` and `weight` separately, skips NA
df.stdOf { (weight ?: 0) / age } // std of expression evaluated for every row
// SampleEnd
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import io.kotest.matchers.doubles.shouldBeNaN
import io.kotest.matchers.floats.shouldBeNaN
import io.kotest.matchers.shouldBe
import io.kotest.matchers.string.shouldContain
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.columnOf
import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
import org.jetbrains.kotlinx.dataframe.api.isEmpty
Expand All @@ -14,7 +16,9 @@ import org.jetbrains.kotlinx.dataframe.api.sum
import org.jetbrains.kotlinx.dataframe.api.sumFor
import org.jetbrains.kotlinx.dataframe.api.sumOf
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import org.jetbrains.kotlinx.dataframe.impl.nullableNothingType
import org.junit.Test
import kotlin.reflect.typeOf

class SumTests {

Expand Down Expand Up @@ -46,6 +50,28 @@ class SumTests {
df.sumOf { value() } shouldBe expected
}

/**
 * Checks the result of `sum()` on columns that contain only `null` values,
 * for three different declared column types: `Int?`, `Number?`, and nullable `Nothing`.
 */
@Test
fun `empty column with types`() {
// all-null column declared as Int?: the expected sum is the Int value 0
val emptyIntCol by columnOf<Int?>(null, null)
emptyIntCol.sum() shouldBe 0

// all-null column declared as Number?: the expected sum is the Double value 0.0
val emptyNumberColumn = DataColumn.createValueColumn<Number?>(
"emptyNumberColumn",
listOf(null, null),
typeOf<Number?>(),
)
emptyNumberColumn.sum() shouldBe 0.0

// all-null column declared with the nullable Nothing type; it is cast to Number?
// before calling sum(), and the expected sum is the Double value 0.0
val emptyNothingColumn = DataColumn.createValueColumn(
"emptyNothingColumn",
listOf(null, null),
nullableNothingType,
)
emptyNothingColumn.cast<Number?>().sum() shouldBe 0.0
}

@Test
fun `test multiple columns`() {
val value1 by columnOf(1, 2, 3)
Expand Down
1 change: 1 addition & 0 deletions docs/StardustDocs/d.tree
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
<toc-element topic="DataRow.md"/>
</toc-element>
<toc-element topic="nanAndNa.md"/>
<toc-element topic="numberUnification.md"/>
<toc-element topic="operations.md"/>
<toc-element toc-title="Operations">
<toc-element topic="create.md">
Expand Down
16 changes: 8 additions & 8 deletions docs/StardustDocs/topics/DataRow.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,21 +83,21 @@ Row condition signature: ```DataRow.(DataRow) -> Boolean```
<snippet id="rowStatistics">

The following [statistics](summaryStatistics.md) are available for `DataRow`:
* `rowMax`
* `rowMin`
* `rowSum`
* `rowMean`
* `rowStd`
* `rowMedian`

These statistics will be applied only to values of appropriate types and incompatible values will be ignored.
For example, if [`DataFrame`](DataFrame.md) has columns of type `String` and `Int`, `rowSum()` will successfully compute sum of `Int` values in a row and ignore `String` values.
These statistics will be applied only to values of appropriate types, and incompatible values will be ignored.
For example, if a [dataframe](DataFrame.md) has columns of types `String` and `Int`,
`rowSum()` will compute the sum of the `Int` values in the row and ignore `String` values.

To apply statistics only to values of particular type use `-Of` versions:
* `rowMaxOf<T>`
* `rowMinOf<T>`
To apply statistics only to values of a particular type, use the `-Of` versions:
* `rowSumOf<T>`
* `rowMeanOf<T>`
* `rowStdOf<T>`
* `rowMinOf<T>`
* `rowMaxOf<T>`
* `rowMedianOf<T>`
* `rowPercentileOf<T>`

</snippet>
4 changes: 3 additions & 1 deletion docs/StardustDocs/topics/columnStatistics.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
[//]: # (title: Column statistics)

// TODO
Statistics on columns are described:
- [here](summaryStatistics.md) for summary statistics, like [sum](sum.md) and [mean](mean.md)
- [here](cumSum.md) for cumulative statistics, like [cumSum](cumSum.md)
36 changes: 31 additions & 5 deletions docs/StardustDocs/topics/mean.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,28 @@

<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->

Computes the mean of values.
Computes the [mean (average)](https://en.wikipedia.org/wiki/Arithmetic_mean) of values.

Is available for numeric columns. Computed value has type `Double`.
Use `skipNA` flag to skip [`NA` values](nanAndNa.md#na) (`null` and `NaN`).
`null` values are ignored.

All primitive numeric types are supported: `Byte`, `Short`, `Int`, `Long`, `Float`, and `Double`.

`mean` also supports the "mixed" `Number` type, as long as the column consists only of the aforementioned
primitive numbers.
The numbers are automatically converted to a [common type](numberUnification.md) for the operation.

The return type is always `Double`; `Double.NaN` for empty columns.

All operations on `Double`/`Float`/`Number` have the `skipNaN` option, which is
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
When it's set to `true`, `NaN` values are ignored.

<!---FUN meanModes-->

```kotlin
df.mean() // mean of values per every numeric column
df.mean(skipNaN = true) { age and weight } // mean of all values in `age` and `weight`, skips NA
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NA
df.mean { age and weight } // mean of all values in `age` and `weight`
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NaN
df.meanOf { (weight ?: 0) / age } // mean of expression evaluated for every row
```

Expand All @@ -31,3 +42,18 @@ df.pivot { city }.groupBy { name.lastName }.mean()
<!---END-->

See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.

### Type Conversion

The following automatic type conversions are performed for the `mean` operation:

| Conversion | Result for Empty Input |
|----------------------------------------------------------------------------|------------------------|
| Int -> Double | Double.NaN |
| Byte -> Double | Double.NaN |
| Short -> Double | Double.NaN |
| Long -> Double | Double.NaN |
| Double -> Double | Double.NaN |
| Float -> Double | Double.NaN |
| Number -> Conversion([Common number type](numberUnification.md)) -> Double | Double.NaN |
| Nothing -> Double | Double.NaN |
42 changes: 39 additions & 3 deletions docs/StardustDocs/topics/median.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,37 @@

<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->

Computes the median of values.
Computes the [median](https://en.wikipedia.org/wiki/Median) of values.

Is available for `Comparable` columns. [`NA` values](nanAndNa.md#na) (`null` and `NaN`) are ignored.
This is also called the "middle" of a sorted list, the "50th [percentile](percentile.md)", or
the 2-[quantile](https://en.wikipedia.org/wiki/Quantile).

`null` values in the input are ignored.
The operations either throw an exception when the input is empty (after filtering `null` or `NaN` values),
or they return `null` when using the `-orNull` overloads.

All primitive numeric types are supported: `Byte`, `Short`, `Int`, `Long`, `Float`, and `Double`,
but no mix of different number types.
In these cases, the return type is always `Double?`.
When the number of values is even, the median is the average of the two middle values.

The operation is also available for self-comparable columns
(so columns of type `T : Comparable<T>`, like `DateTime`, `String`, etc.).
In this case, the return type remains `T?`.
When the number of values is even, the median is the lower of the two middle values.

All operations on `Double`/`Float` have the `skipNaN` option, which is
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
When it's set to `true`, `NaN` values are ignored.

<!---FUN medianModes-->

```kotlin
df.median() // median of values per every comparable column
df.median { age and weight } // median of all values in `age` and `weight`
df.medianFor { age and weight } // median of values per `age` and `weight` separately
df.medianFor(skipNaN = true) { age and weight } // median of values per `age` and `weight` separately
df.medianOf { (weight ?: 0) / age } // median of expression evaluated for every row
df.medianBy { age } // DataRow where the median age lies (lower-median for an even number of values)
```

<!---END-->
Expand All @@ -30,3 +50,19 @@ df.pivot { city }.groupBy { name.lastName }.median()
<!---END-->

See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.

### Type Conversion

The following automatic type conversions are performed for the `median` operation.
(Note that `null` only appears in the return type when using `-orNull` overloads).

| Conversion | Result for Empty Input |
|--------------------------------|------------------------|
| T -> T where T : Comparable<T> | null |
| Int -> Double | null |
| Byte -> Double | null |
| Short -> Double | null |
| Long -> Double | null |
| Double -> Double | null |
| Float -> Double | null |
| Nothing -> Nothing | null |
33 changes: 29 additions & 4 deletions docs/StardustDocs/topics/minmax.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,26 @@

<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->

Computes the minimum / maximum of values.
Computes the [minimum / maximum](https://en.wikipedia.org/wiki/Maximum_and_minimum) of values.

Is available for [`Comparable`](https://kotlinlang.org/api/latest/jvm/stdlib/kotlin/-comparable/) columns.
[`NA` values](nanAndNa.md#na) (`null` and `NaN`) are ignored.
`null` values in the input are ignored.
The operations either throw an exception when the input is empty (after filtering `null` or `NaN` values),
or they return `null` when using the `-orNull` overloads.

They are available for self-comparable columns
(so columns of type `T : Comparable<T>`, like `DateTime`, `String`, `Int`, etc.),
which includes all primitive number columns, but not columns that mix different number types.

All operations on `Double`/`Float` have the `skipNaN` option, which is
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
When it's set to `true`, `NaN` values are ignored.

<!---FUN minmaxModes-->

```kotlin
df.min() // min of values per every comparable column
df.min { age and weight } // min of all values in `age` and `weight`
df.minFor { age and weight } // min of values per `age` and `weight` separately
df.minFor(skipNaN = true) { age and weight } // min of values per `age` and `weight` separately
df.minOf { (weight ?: 0) / age } // min of expression evaluated for every row
df.minBy { age } // DataRow with minimal `age`
```
Expand All @@ -32,3 +41,19 @@ df.pivot { city }.groupBy { name.lastName }.min()
<!---END-->

See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.

### Type Conversion

The following automatic type conversions are performed for the `min` and `max` operations.
(Note that `null` only appears in the return type when using `-orNull` overloads).

| Conversion | Result for Empty Input |
|--------------------------------|------------------------|
| T -> T where T : Comparable<T> | null |
| Int -> Int | null |
| Byte -> Byte | null |
| Short -> Short | null |
| Long -> Long | null |
| Double -> Double | null |
| Float -> Float | null |
| Nothing -> Nothing | null |
3 changes: 3 additions & 0 deletions docs/StardustDocs/topics/numberUnification.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[//]: # (title: Number Unification)

// TODO
Loading