Skip to content

Commit 0aa4038

Browse files
committed
statistics documentation update
1 parent faf0d91 commit 0aa4038

File tree

13 files changed

+302
-66
lines changed

13 files changed

+302
-66
lines changed

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/samples/api/Analyze.kt

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import org.jetbrains.kotlinx.dataframe.api.mean
3535
import org.jetbrains.kotlinx.dataframe.api.meanFor
3636
import org.jetbrains.kotlinx.dataframe.api.meanOf
3737
import org.jetbrains.kotlinx.dataframe.api.median
38+
import org.jetbrains.kotlinx.dataframe.api.medianBy
3839
import org.jetbrains.kotlinx.dataframe.api.medianFor
3940
import org.jetbrains.kotlinx.dataframe.api.medianOf
4041
import org.jetbrains.kotlinx.dataframe.api.min
@@ -43,6 +44,7 @@ import org.jetbrains.kotlinx.dataframe.api.minFor
4344
import org.jetbrains.kotlinx.dataframe.api.minOf
4445
import org.jetbrains.kotlinx.dataframe.api.minOrNull
4546
import org.jetbrains.kotlinx.dataframe.api.percentile
47+
import org.jetbrains.kotlinx.dataframe.api.percentileBy
4648
import org.jetbrains.kotlinx.dataframe.api.percentileFor
4749
import org.jetbrains.kotlinx.dataframe.api.percentileOf
4850
import org.jetbrains.kotlinx.dataframe.api.pivot
@@ -179,7 +181,7 @@ class Analyze : TestBase() {
179181
// SampleStart
180182
df.sum() // sum of values per every numeric column
181183
df.sum { age and weight } // sum of all values in `age` and `weight`
182-
df.sumFor { age and weight } // sum of values per `age` and `weight` separately
184+
df.sumFor(skipNaN = true) { age and weight } // sum of values per `age` and `weight` separately
183185
df.sumOf { (weight ?: 0) / age } // sum of expression evaluated for every row
184186
// SampleEnd
185187
}
@@ -190,7 +192,7 @@ class Analyze : TestBase() {
190192
// SampleStart
191193
df.min() // min of values per every comparable column
192194
df.min { age and weight } // min of all values in `age` and `weight`
193-
df.minFor { age and weight } // min of values per `age` and `weight` separately
195+
df.minFor(skipNaN = true) { age and weight } // min of values per `age` and `weight` separately
194196
df.minOf { (weight ?: 0) / age } // min of expression evaluated for every row
195197
df.minBy { age } // DataRow with minimal `age`
196198
// SampleEnd
@@ -214,8 +216,9 @@ class Analyze : TestBase() {
214216
// SampleStart
215217
df.median() // median of values per every comparable column
216218
df.median { age and weight } // median of all values in `age` and `weight`
217-
df.medianFor { age and weight } // median of values per `age` and `weight` separately
219+
df.medianFor(skipNaN = true) { age and weight } // median of values per `age` and `weight` separately
218220
df.medianOf { (weight ?: 0) / age } // median of expression evaluated for every row
221+
df.medianBy { age } // DataRow where the median age lies (lower-median for an even number of values)
219222
// SampleEnd
220223
}
221224

@@ -235,10 +238,11 @@ class Analyze : TestBase() {
235238
@TransformDataFrameExpressions
236239
fun percentileModes() {
237240
// SampleStart
238-
df.percentile(25.0) // percentile of values per every comparable column
239-
df.percentile(25.0) { age and weight } // percentile of all values in `age` and `weight`
240-
df.percentileFor(25.0) { age and weight } // percentile of values per `age` and `weight` separately
241-
df.percentileOf(25.0) { (weight ?: 0) / age } // percentile of expression evaluated for every row
241+
df.percentile(25.0) // 25th percentile of values per every comparable column
242+
df.percentile(75.0) { age and weight } // 75th percentile of all values in `age` and `weight`
243+
df.percentileFor(50.0, skipNaN = true) { age and weight } // 50th percentile of values per `age` and `weight` separately
244+
df.percentileOf(75.0) { (weight ?: 0) / age } // 75th percentile of expression evaluated for every row
245+
df.percentileBy(25.0) { age } // DataRow where the 25th percentile of `age` lies (index rounded using R3)
242246
// SampleEnd
243247
}
244248

@@ -247,9 +251,9 @@ class Analyze : TestBase() {
247251
fun percentileAggregations() {
248252
// SampleStart
249253
df.percentile(25.0)
250-
df.age.percentile(25.0)
251-
df.groupBy { city }.percentile(25.0)
252-
df.pivot { city }.percentile(25.0)
254+
df.age.percentile(75.0)
255+
df.groupBy { city }.percentile(50.0)
256+
df.pivot { city }.percentile(75.0)
253257
df.pivot { city }.groupBy { name.lastName }.percentile(25.0)
254258
// SampleEnd
255259
}
@@ -259,8 +263,8 @@ class Analyze : TestBase() {
259263
fun meanModes() {
260264
// SampleStart
261265
df.mean() // mean of values per every numeric column
262-
df.mean(skipNaN = true) { age and weight } // mean of all values in `age` and `weight`, skips NA
263-
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NA
266+
df.mean { age and weight } // mean of all values in `age` and `weight`
267+
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NaN
264268
df.meanOf { (weight ?: 0) / age } // mean of expression evaluated for every row
265269
// SampleEnd
266270
}
@@ -283,7 +287,7 @@ class Analyze : TestBase() {
283287
// SampleStart
284288
df.std() // std of values per every numeric column
285289
df.std { age and weight } // std of all values in `age` and `weight`
286-
df.stdFor { age and weight } // std of values per `age` and `weight` separately, skips NA
290+
df.stdFor(skipNaN = true) { age and weight } // std of values per `age` and `weight` separately, skips NaN
287291
df.stdOf { (weight ?: 0) / age } // std of expression evaluated for every row
288292
// SampleEnd
289293
}

docs/StardustDocs/d.tree

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
<toc-element topic="DataRow.md"/>
3838
</toc-element>
3939
<toc-element topic="nanAndNa.md"/>
40+
<toc-element topic="numberUnification.md"/>
4041
<toc-element topic="operations.md"/>
4142
<toc-element toc-title="Operations">
4243
<toc-element topic="create.md">

docs/StardustDocs/topics/DataRow.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -83,21 +83,21 @@ Row condition signature: ```DataRow.(DataRow) -> Boolean```
8383
<snippet id="rowStatistics">
8484

8585
The following [statistics](summaryStatistics.md) are available for `DataRow`:
86-
* `rowMax`
87-
* `rowMin`
8886
* `rowSum`
8987
* `rowMean`
9088
* `rowStd`
91-
* `rowMedian`
9289

93-
These statistics will be applied only to values of appropriate types and incompatible values will be ignored.
94-
For example, if [`DataFrame`](DataFrame.md) has columns of type `String` and `Int`, `rowSum()` will successfully compute sum of `Int` values in a row and ignore `String` values.
90+
These statistics will be applied only to values of appropriate types, and incompatible values will be ignored.
91+
For example, if a [dataframe](DataFrame.md) has columns of types `String` and `Int`,
92+
`rowSum()` will compute the sum of the `Int` values in the row and ignore `String` values.
9593

96-
To apply statistics only to values of particular type use `-Of` versions:
97-
* `rowMaxOf<T>`
98-
* `rowMinOf<T>`
94+
To apply statistics only to values of a particular type, use the `-Of` versions:
9995
* `rowSumOf<T>`
10096
* `rowMeanOf<T>`
97+
* `rowStdOf<T>`
98+
* `rowMinOf<T>`
99+
* `rowMaxOf<T>`
101100
* `rowMedianOf<T>`
101+
* `rowPercentileOf<T>`
102102

103103
</snippet>
Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
11
[//]: # (title: Column statistics)
22

3-
// TODO
3+
Statistics on columns are described:
4+
- [here](summaryStatistics.md) for summary statistics, like [sum](sum.md) and [mean](mean.md)
5+
- [here](columnStatistics.md) for cumulative statistics, like [cumSum](cumSum.md)

docs/StardustDocs/topics/mean.md

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,28 @@
22

33
<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->
44

5-
Computes the mean of values.
5+
Computes the [mean (average)](https://en.wikipedia.org/wiki/Arithmetic_mean) of values.
66

7-
Is available for numeric columns. Computed value has type `Double`.
8-
Use `skipNA` flag to skip [`NA` values](nanAndNa.md#na) (`null` and `NaN`).
7+
`null` values are ignored.
8+
9+
All primitive numeric types are supported: `Byte`, `Short`, `Int`, `Long`, `Float`, and `Double`.
10+
11+
`mean` also supports the "mixed" `Number` type, as long as the column consists only of the aforementioned
12+
primitive numbers.
13+
The numbers are automatically converted to a [common type](numberUnification.md) for the operation.
14+
15+
The return type is always `Double`; `Double.NaN` for empty columns.
16+
17+
All operations on `Double`/`Float`/`Number` have the `skipNaN` option, which is
18+
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
19+
When it's set to `true`, `NaN` values are ignored.
920

1021
<!---FUN meanModes-->
1122

1223
```kotlin
1324
df.mean() // mean of values per every numeric column
14-
df.mean(skipNaN = true) { age and weight } // mean of all values in `age` and `weight`, skips NA
15-
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NA
25+
df.mean { age and weight } // mean of all values in `age` and `weight`
26+
df.meanFor(skipNaN = true) { age and weight } // mean of values per `age` and `weight` separately, skips NaN
1627
df.meanOf { (weight ?: 0) / age } // mean of expression evaluated for every row
1728
```
1829

@@ -31,3 +42,18 @@ df.pivot { city }.groupBy { name.lastName }.mean()
3142
<!---END-->
3243

3344
See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.
45+
46+
### Type Conversion
47+
48+
The following automatic type conversions are performed for the `mean` operation:
49+
50+
| Conversion | skipNaN option |
51+
|----------------------------------------------------------------------------|----------------|
52+
| Int -> Double | |
53+
| Byte -> Double | |
54+
| Short -> Double | |
55+
| Long -> Double | |
56+
| Double -> Double | yes |
57+
| Float -> Double | yes |
58+
| Number -> Conversion([Common number type](numberUnification.md)) -> Double | yes |
59+
| Nothing / no values -> Double (NaN) | |

docs/StardustDocs/topics/median.md

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,37 @@
22

33
<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->
44

5-
Computes the median of values.
5+
Computes the [median](https://en.wikipedia.org/wiki/Median) of values.
66

7-
Is available for `Comparable` columns. [`NA` values](nanAndNa.md#na) (`null` and `NaN`) are ignored.
7+
This is also called the "middle" of a sorted list, the "50th [percentile](percentile.md)", or
8+
the 2-[quantile](https://en.wikipedia.org/wiki/Quantile).
9+
10+
`null` values in the input are ignored.
11+
The operations either throw an exception when the input is empty (after filtering `null` or `NaN` values),
12+
or they return `null` when using the `-orNull` overloads.
13+
14+
All primitive numeric types are supported: `Byte`, `Short`, `Int`, `Long`, `Float`, and `Double`,
15+
but no mix of different number types.
16+
In these cases, the return type is always `Double?`.
17+
When the number of values is even, the median is the average of the two middle values.
18+
19+
The operation is also available for self-comparable columns
20+
(so columns of type `T : Comparable<T>`, like `DateTime`, `String`, etc.).
21+
In this case, the return type remains `T?`.
22+
When the number of values is even, the median is the lower of the two middle values.
23+
24+
All operations on `Double`/`Float` have the `skipNaN` option, which is
25+
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
26+
When it's set to `true`, `NaN` values are ignored.
827

928
<!---FUN medianModes-->
1029

1130
```kotlin
1231
df.median() // median of values per every comparable column
1332
df.median { age and weight } // median of all values in `age` and `weight`
14-
df.medianFor { age and weight } // median of values per `age` and `weight` separately
33+
df.medianFor(skipNaN = true) { age and weight } // median of values per `age` and `weight` separately
1534
df.medianOf { (weight ?: 0) / age } // median of expression evaluated for every row
35+
df.medianBy { age } // DataRow where the median age lies (lower-median for an even number of values)
1636
```
1737

1838
<!---END-->
@@ -30,3 +50,18 @@ df.pivot { city }.groupBy { name.lastName }.median()
3050
<!---END-->
3151

3252
See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.
53+
54+
### Type Conversion
55+
56+
The following automatic type conversions are performed for the `median` operation:
57+
58+
| Conversion | skipNaN option |
59+
|------------------------------------------|----------------|
60+
| T -> T? where T : Comparable<T> | |
61+
| Int -> Double? | |
62+
| Byte -> Double? | |
63+
| Short -> Double? | |
64+
| Long -> Double? | |
65+
| Double -> Double? | yes |
66+
| Float -> Double? | yes |
67+
| Nothing / no values -> Nothing? (`null`) | |

docs/StardustDocs/topics/minmax.md

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,26 @@
22

33
<!---IMPORT org.jetbrains.kotlinx.dataframe.samples.api.Analyze-->
44

5-
Computes the minimum / maximum of values.
5+
Computes the [minimum / maximum](https://en.wikipedia.org/wiki/Maximum_and_minimum) of values.
66

7-
Is available for [`Comparable`](https://kotlinlang.org/api/latest/jvm/stdlib/kotlin/-comparable/) columns.
8-
[`NA` values](nanAndNa.md#na) (`null` and `NaN`) are ignored.
7+
`null` values in the input are ignored.
8+
The operations either throw an exception when the input is empty (after filtering `null` or `NaN` values),
9+
or they return `null` when using the `-orNull` overloads.
10+
11+
They are available for self-comparable columns
12+
(so columns of type `T : Comparable<T>`, like `DateTime`, `String`, `Int`, etc.)
13+
which includes all primitive number columns, but no mix of different number types.
14+
15+
All operations on `Double`/`Float` have the `skipNaN` option, which is
16+
set to `false` by default. This means that if a `NaN` is present in the input, it will be propagated to the result.
17+
When it's set to `true`, `NaN` values are ignored.
918

1019
<!---FUN minmaxModes-->
1120

1221
```kotlin
1322
df.min() // min of values per every comparable column
1423
df.min { age and weight } // min of all values in `age` and `weight`
15-
df.minFor { age and weight } // min of values per `age` and `weight` separately
24+
df.minFor(skipNaN = true) { age and weight } // min of values per `age` and `weight` separately
1625
df.minOf { (weight ?: 0) / age } // min of expression evaluated for every row
1726
df.minBy { age } // DataRow with minimal `age`
1827
```
@@ -32,3 +41,19 @@ df.pivot { city }.groupBy { name.lastName }.min()
3241
<!---END-->
3342

3443
See [statistics](summaryStatistics.md#groupby-statistics) for details on complex data aggregations.
44+
45+
### Type Conversion
46+
47+
The following automatic type conversions are performed for the `min` and `max` operations.
48+
(Note that `null` only appears in the return type when using `-orNull` overloads.)
49+
50+
| Conversion | skipNaN option |
51+
|------------------------------------------|----------------|
52+
| T -> T? where T : Comparable<T> | |
53+
| Int -> Int? | |
54+
| Byte -> Byte? | |
55+
| Short -> Short? | |
56+
| Long -> Long? | |
57+
| Double -> Double? | yes |
58+
| Float -> Float? | yes |
59+
| Nothing / no values -> Nothing? (`null`) | |
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[//]: # (title: Number Unification)
2+
3+
// TODO

0 commit comments

Comments
 (0)