diff --git a/docs/source/_build/API_REFERENCE_LINKS.yml b/docs/source/_build/API_REFERENCE_LINKS.yml index 1e301f592cb1..7d2bf10defa4 100644 --- a/docs/source/_build/API_REFERENCE_LINKS.yml +++ b/docs/source/_build/API_REFERENCE_LINKS.yml @@ -42,13 +42,19 @@ python: Expr.str: name: "str namespace" link: https://docs.pola.rs/api/python/stable/reference/expressions/string.html + Expr.arr: + name: "arr namespace" + link: https://docs.pola.rs/api/python/stable/reference/expressions/array.html element: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.element.html all: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.all.html exclude: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.exclude.html alias: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.alias.html prefix: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.prefix.html suffix: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.suffix.html + map: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.map.html n_unique: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.n_unique.html + unique: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.unique.html + unique_counts: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.unique_counts.html approx_n_unique: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.approx_n_unique.html when: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.when.html concat_list: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.concat_list.html @@ -116,6 +122,7 @@ python: name: log link: https://numpy.org/doc/stable/reference/generated/numpy.log.html feature_flags: ['numpy'] + List: https://docs.pola.rs/api/python/stable/reference/api/polars.datatypes.List.html 
Array: https://docs.pola.rs/api/python/stable/reference/api/polars.datatypes.Array.html Series.arr: https://docs.pola.rs/api/python/stable/reference/series/array.html Series.dt.day: https://docs.pola.rs/api/python/stable/reference/series/api/polars.Series.dt.day.html @@ -155,6 +162,8 @@ python: str.to_date: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.to_date.html str.len_chars: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.len_chars.html str.len_bytes: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.len_bytes.html + str.head: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.head.html + str.tail: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.str.tail.html struct.field: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.struct.field.html struct.rename_fields: https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.struct.rename_fields.html @@ -193,7 +202,11 @@ rust: operators: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Operator.html - Array: https://docs.pola.rs/api/rust/dev/polars/datatypes/enum.DataType.html#variant.Array + List: https://docs.pola.rs/api/rust/dev/polars/datatypes/enum.DataType.html#variant.List + Array: + name: Array + link: https://docs.pola.rs/api/rust/dev/polars/datatypes/enum.DataType.html#variant.Array + feature_flags: [dtype-array] DataFrame.explode: https://docs.pola.rs/api/rust/dev/polars/frame/struct.DataFrame.html#method.explode pivot: https://docs.pola.rs/api/rust/dev/polars_lazy/frame/pivot/fn.pivot.html @@ -313,6 +326,11 @@ rust: fill_nan: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.fill_nan fill_null: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.fill_null n_unique: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.n_unique + unique: 
https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.unique + unique_counts: + name: unique_counts + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.unique_counts + feature_flags: ['unique_counts'] null_count: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.null_count interpolate: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/enum.Expr.html#method.interpolate is_between: @@ -333,6 +351,10 @@ rust: name: "str namespace" link: https://docs.pola.rs/api/rust/dev/polars/prelude/trait.StringNameSpaceImpl.html feature_flags: [strings] + Expr.arr: + name: "`arr` namespace" + link: https://docs.pola.rs/api/rust/dev/polars/prelude/enum.Expr.html#method.arr + feature_flags: [dtype-array] Series.arr: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ArrayNameSpace.html date_range: @@ -377,6 +399,12 @@ rust: name: str.contains link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.contains feature_flags: [regex] + str.head: + name: str.head + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.head + str.tail: + name: str.tail + link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.tail str.extract: name: str.extract link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/string/struct.StringNameSpace.html#method.extract @@ -420,6 +448,9 @@ rust: name: "name namespace" link: https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/struct.ExprNameNameSpace.html feature_flags: [lazy] + prefix: https://docs.rs/polars/latest/polars/prelude/struct.ExprNameNameSpace.html#method.prefix + suffix: https://docs.rs/polars/latest/polars/prelude/struct.ExprNameNameSpace.html#method.suffix + map: https://docs.rs/polars/latest/polars/prelude/struct.ExprNameNameSpace.html#method.map Expr.dt: name: "dt namespace" link: 
https://docs.pola.rs/api/rust/dev/polars_lazy/dsl/dt/struct.DateLikeNameSpace.html diff --git a/docs/source/_build/css/extra.css b/docs/source/_build/css/extra.css index 420db3966780..4f9cd5638a55 100644 --- a/docs/source/_build/css/extra.css +++ b/docs/source/_build/css/extra.css @@ -1,18 +1,20 @@ :root { - --md-primary-fg-color: #0B7189 ; - --md-primary-fg-color--light: #C2CCD6; - --md-primary-fg-color--dark: #103547; - --md-text-font: 'Proxima Nova', sans-serif; + --md-primary-fg-color: #0B7189; + --md-primary-fg-color--light: #C2CCD6; + --md-primary-fg-color--dark: #103547; + --md-text-font: 'Proxima Nova', sans-serif; } -span .md-typeset .emojione, .md-typeset .gemoji, .md-typeset .twemoji { - vertical-align: text-bottom; +span .md-typeset .emojione, +.md-typeset .gemoji, +.md-typeset .twemoji { + vertical-align: text-bottom; } @font-face { - font-family: 'Proxima Nova', sans-serif; - src: 'https://fonts.cdnfonts.com/css/proxima-nova-2' + font-family: 'Proxima Nova', sans-serif; + src: 'https://fonts.cdnfonts.com/css/proxima-nova-2' } :root { @@ -20,14 +22,14 @@ span .md-typeset .emojione, .md-typeset .gemoji, .md-typeset .twemoji { } .contributor_icon { - height:40px; - width:40px; - border-radius: 20px; - margin: 0 5px; + height: 40px; + width: 40px; + border-radius: 20px; + margin: 0 5px; } -.feature-flag{ - background-color: rgba(255, 245, 214,.5); +.feature-flag { + background-color: rgba(255, 245, 214, .5); border: none; padding: 0px 5px; text-align: center; @@ -38,27 +40,38 @@ span .md-typeset .emojione, .md-typeset .gemoji, .md-typeset .twemoji { font-size: .85em; } -[data-md-color-scheme=slate] .feature-flag{ - background-color:var(--md-code-bg-color); +[data-md-color-scheme=slate] .feature-flag { + background-color: var(--md-code-bg-color); } -.md-typeset ol li, .md-typeset ul li{ - margin-bottom: 0em !important; + +.md-typeset ol li, +.md-typeset ul li { + margin-bottom: 0em !important; } :root { - --md-admonition-icon--rust: 
url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'%3E%3C!--! Font Awesome Free 6.4.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--%3E%3Cpath d='m508.52 249.75-21.82-13.51c-.17-2-.34-3.93-.55-5.88l18.72-17.5a7.35 7.35 0 0 0-2.44-12.25l-24-9c-.54-1.88-1.08-3.78-1.67-5.64l15-20.83a7.35 7.35 0 0 0-4.79-11.54l-25.42-4.15c-.9-1.73-1.79-3.45-2.73-5.15l10.68-23.42a7.35 7.35 0 0 0-6.95-10.39l-25.82.91q-1.79-2.22-3.61-4.4L439 81.84a7.36 7.36 0 0 0-8.84-8.84L405 78.93q-2.17-1.83-4.4-3.61l.91-25.82a7.35 7.35 0 0 0-10.39-7L367.7 53.23c-1.7-.94-3.43-1.84-5.15-2.73l-4.15-25.42a7.35 7.35 0 0 0-11.54-4.79L326 35.26c-1.86-.59-3.75-1.13-5.64-1.67l-9-24a7.35 7.35 0 0 0-12.25-2.44l-17.5 18.72c-1.95-.21-3.91-.38-5.88-.55L262.25 3.48a7.35 7.35 0 0 0-12.5 0L236.24 25.3c-2 .17-3.93.34-5.88.55l-17.5-18.72a7.35 7.35 0 0 0-12.25 2.44l-9 24c-1.89.55-3.79 1.08-5.66 1.68l-20.82-15a7.35 7.35 0 0 0-11.54 4.79l-4.15 25.41c-1.73.9-3.45 1.79-5.16 2.73l-23.4-10.63a7.35 7.35 0 0 0-10.39 7l.92 25.81c-1.49 1.19-3 2.39-4.42 3.61L81.84 73A7.36 7.36 0 0 0 73 81.84L78.93 107c-1.23 1.45-2.43 2.93-3.62 4.41l-25.81-.91a7.42 7.42 0 0 0-6.37 3.26 7.35 7.35 0 0 0-.57 7.13l10.66 23.41c-.94 1.7-1.83 3.43-2.73 5.16l-25.41 4.14a7.35 7.35 0 0 0-4.79 11.54l15 20.82c-.59 1.87-1.13 3.77-1.68 5.66l-24 9a7.35 7.35 0 0 0-2.44 12.25l18.72 17.5c-.21 1.95-.38 3.91-.55 5.88l-21.86 13.5a7.35 7.35 0 0 0 0 12.5l21.82 13.51c.17 2 .34 3.92.55 5.87l-18.72 17.5a7.35 7.35 0 0 0 2.44 12.25l24 9c.55 1.89 1.08 3.78 1.68 5.65l-15 20.83a7.35 7.35 0 0 0 4.79 11.54l25.42 4.15c.9 1.72 1.79 3.45 2.73 5.14l-10.63 23.43a7.35 7.35 0 0 0 .57 7.13 7.13 7.13 0 0 0 6.37 3.26l25.83-.91q1.77 2.22 3.6 4.4L73 430.16a7.36 7.36 0 0 0 8.84 8.84l25.16-5.93q2.18 1.83 4.41 3.61l-.92 25.82a7.35 7.35 0 0 0 10.39 6.95l23.43-10.68c1.69.94 3.42 1.83 5.14 2.73l4.15 25.42a7.34 7.34 0 0 0 11.54 
4.78l20.83-15c1.86.6 3.76 1.13 5.65 1.68l9 24a7.36 7.36 0 0 0 12.25 2.44l17.5-18.72c1.95.21 3.92.38 5.88.55l13.51 21.82a7.35 7.35 0 0 0 12.5 0l13.51-21.82c2-.17 3.93-.34 5.88-.56l17.5 18.73a7.36 7.36 0 0 0 12.25-2.44l9-24c1.89-.55 3.78-1.08 5.65-1.68l20.82 15a7.34 7.34 0 0 0 11.54-4.78l4.15-25.42c1.72-.9 3.45-1.79 5.15-2.73l23.42 10.68a7.35 7.35 0 0 0 10.39-6.95l-.91-25.82q2.22-1.79 4.4-3.61l25.15 5.93a7.36 7.36 0 0 0 8.84-8.84L433.07 405q1.83-2.17 3.61-4.4l25.82.91a7.23 7.23 0 0 0 6.37-3.26 7.35 7.35 0 0 0 .58-7.13l-10.68-23.42c.94-1.7 1.83-3.43 2.73-5.15l25.42-4.15a7.35 7.35 0 0 0 4.79-11.54l-15-20.83c.59-1.87 1.13-3.76 1.67-5.65l24-9a7.35 7.35 0 0 0 2.44-12.25l-18.72-17.5c.21-1.95.38-3.91.55-5.87l21.82-13.51a7.35 7.35 0 0 0 0-12.5Zm-151 129.08A13.91 13.91 0 0 0 341 389.51l-7.64 35.67a187.51 187.51 0 0 1-156.36-.74l-7.64-35.66a13.87 13.87 0 0 0-16.46-10.68l-31.51 6.76a187.38 187.38 0 0 1-16.26-19.21H258.3c1.72 0 2.89-.29 2.89-1.91v-54.19c0-1.57-1.17-1.91-2.89-1.91h-44.83l.05-34.35H262c4.41 0 23.66 1.28 29.79 25.87 1.91 7.55 6.17 32.14 9.06 40 2.89 8.82 14.6 26.46 27.1 26.46H407a187.3 187.3 0 0 1-17.34 20.09Zm25.77 34.49A15.24 15.24 0 1 1 368 398.08h.44a15.23 15.23 0 0 1 14.8 15.24Zm-225.62-.68a15.24 15.24 0 1 1-15.25-15.25h.45a15.25 15.25 0 0 1 14.75 15.25Zm-88.1-178.49 32.83-14.6a13.88 13.88 0 0 0 7.06-18.33L102.69 186h26.56v119.73h-53.6a187.65 187.65 0 0 1-6.08-71.58Zm-11.26-36.06a15.24 15.24 0 0 1 15.23-15.25H74a15.24 15.24 0 1 1-15.67 15.24Zm155.16 24.49.05-35.32h63.26c3.28 0 23.07 3.77 23.07 18.62 0 12.29-15.19 16.7-27.68 16.7ZM399 306.71c-9.8 1.13-20.63-4.12-22-10.09-5.78-32.49-15.39-39.4-30.57-51.4 18.86-11.95 38.46-29.64 38.46-53.26 0-25.52-17.49-41.59-29.4-49.48-16.76-11-35.28-13.23-40.27-13.23h-198.9a187.49 187.49 0 0 1 104.89-59.19l23.47 24.6a13.82 13.82 0 0 0 19.6.44l26.26-25a187.51 187.51 0 0 1 128.37 91.43l-18 40.57a14 14 0 0 0 7.09 18.33l34.59 15.33a187.12 187.12 0 0 1 .4 32.54h-19.28c-1.91 0-2.69 1.27-2.69 3.13v8.82C421 301 409.31 305.58 399 
306.71ZM240 60.21A15.24 15.24 0 0 1 255.21 45h.45A15.24 15.24 0 1 1 240 60.21ZM436.84 214a15.24 15.24 0 1 1 0-30.48h.44a15.24 15.24 0 0 1-.44 30.48Z'/%3E%3C/svg%3E"); - } - .md-typeset .admonition.rust, - .md-typeset details.rust { - border-color: rgb(205, 121, 44); - } - .md-typeset .rust > .admonition-title, - .md-typeset .rust > summary { - background-color: rgb(205, 121, 44,.1); - } - .md-typeset .rust > .admonition-title::before, - .md-typeset .rust > summary::before { - background-color:rgb(205, 121, 44); - -webkit-mask-image: var(--md-admonition-icon--rust); - mask-image: var(--md-admonition-icon--rust); - } \ No newline at end of file + --md-admonition-icon--rust: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 512 512'%3E%3C!--! Font Awesome Free 6.4.0 by @fontawesome - https://fontawesome.com License - https://fontawesome.com/license/free (Icons: CC BY 4.0, Fonts: SIL OFL 1.1, Code: MIT License) Copyright 2023 Fonticons, Inc.--%3E%3Cpath d='m508.52 249.75-21.82-13.51c-.17-2-.34-3.93-.55-5.88l18.72-17.5a7.35 7.35 0 0 0-2.44-12.25l-24-9c-.54-1.88-1.08-3.78-1.67-5.64l15-20.83a7.35 7.35 0 0 0-4.79-11.54l-25.42-4.15c-.9-1.73-1.79-3.45-2.73-5.15l10.68-23.42a7.35 7.35 0 0 0-6.95-10.39l-25.82.91q-1.79-2.22-3.61-4.4L439 81.84a7.36 7.36 0 0 0-8.84-8.84L405 78.93q-2.17-1.83-4.4-3.61l.91-25.82a7.35 7.35 0 0 0-10.39-7L367.7 53.23c-1.7-.94-3.43-1.84-5.15-2.73l-4.15-25.42a7.35 7.35 0 0 0-11.54-4.79L326 35.26c-1.86-.59-3.75-1.13-5.64-1.67l-9-24a7.35 7.35 0 0 0-12.25-2.44l-17.5 18.72c-1.95-.21-3.91-.38-5.88-.55L262.25 3.48a7.35 7.35 0 0 0-12.5 0L236.24 25.3c-2 .17-3.93.34-5.88.55l-17.5-18.72a7.35 7.35 0 0 0-12.25 2.44l-9 24c-1.89.55-3.79 1.08-5.66 1.68l-20.82-15a7.35 7.35 0 0 0-11.54 4.79l-4.15 25.41c-1.73.9-3.45 1.79-5.16 2.73l-23.4-10.63a7.35 7.35 0 0 0-10.39 7l.92 25.81c-1.49 1.19-3 2.39-4.42 3.61L81.84 73A7.36 7.36 0 0 0 73 81.84L78.93 107c-1.23 1.45-2.43 2.93-3.62 4.41l-25.81-.91a7.42 7.42 0 0 0-6.37 3.26 7.35 7.35 0 0 0-.57 7.13l10.66 
23.41c-.94 1.7-1.83 3.43-2.73 5.16l-25.41 4.14a7.35 7.35 0 0 0-4.79 11.54l15 20.82c-.59 1.87-1.13 3.77-1.68 5.66l-24 9a7.35 7.35 0 0 0-2.44 12.25l18.72 17.5c-.21 1.95-.38 3.91-.55 5.88l-21.86 13.5a7.35 7.35 0 0 0 0 12.5l21.82 13.51c.17 2 .34 3.92.55 5.87l-18.72 17.5a7.35 7.35 0 0 0 2.44 12.25l24 9c.55 1.89 1.08 3.78 1.68 5.65l-15 20.83a7.35 7.35 0 0 0 4.79 11.54l25.42 4.15c.9 1.72 1.79 3.45 2.73 5.14l-10.63 23.43a7.35 7.35 0 0 0 .57 7.13 7.13 7.13 0 0 0 6.37 3.26l25.83-.91q1.77 2.22 3.6 4.4L73 430.16a7.36 7.36 0 0 0 8.84 8.84l25.16-5.93q2.18 1.83 4.41 3.61l-.92 25.82a7.35 7.35 0 0 0 10.39 6.95l23.43-10.68c1.69.94 3.42 1.83 5.14 2.73l4.15 25.42a7.34 7.34 0 0 0 11.54 4.78l20.83-15c1.86.6 3.76 1.13 5.65 1.68l9 24a7.36 7.36 0 0 0 12.25 2.44l17.5-18.72c1.95.21 3.92.38 5.88.55l13.51 21.82a7.35 7.35 0 0 0 12.5 0l13.51-21.82c2-.17 3.93-.34 5.88-.56l17.5 18.73a7.36 7.36 0 0 0 12.25-2.44l9-24c1.89-.55 3.78-1.08 5.65-1.68l20.82 15a7.34 7.34 0 0 0 11.54-4.78l4.15-25.42c1.72-.9 3.45-1.79 5.15-2.73l23.42 10.68a7.35 7.35 0 0 0 10.39-6.95l-.91-25.82q2.22-1.79 4.4-3.61l25.15 5.93a7.36 7.36 0 0 0 8.84-8.84L433.07 405q1.83-2.17 3.61-4.4l25.82.91a7.23 7.23 0 0 0 6.37-3.26 7.35 7.35 0 0 0 .58-7.13l-10.68-23.42c.94-1.7 1.83-3.43 2.73-5.15l25.42-4.15a7.35 7.35 0 0 0 4.79-11.54l-15-20.83c.59-1.87 1.13-3.76 1.67-5.65l24-9a7.35 7.35 0 0 0 2.44-12.25l-18.72-17.5c.21-1.95.38-3.91.55-5.87l21.82-13.51a7.35 7.35 0 0 0 0-12.5Zm-151 129.08A13.91 13.91 0 0 0 341 389.51l-7.64 35.67a187.51 187.51 0 0 1-156.36-.74l-7.64-35.66a13.87 13.87 0 0 0-16.46-10.68l-31.51 6.76a187.38 187.38 0 0 1-16.26-19.21H258.3c1.72 0 2.89-.29 2.89-1.91v-54.19c0-1.57-1.17-1.91-2.89-1.91h-44.83l.05-34.35H262c4.41 0 23.66 1.28 29.79 25.87 1.91 7.55 6.17 32.14 9.06 40 2.89 8.82 14.6 26.46 27.1 26.46H407a187.3 187.3 0 0 1-17.34 20.09Zm25.77 34.49A15.24 15.24 0 1 1 368 398.08h.44a15.23 15.23 0 0 1 14.8 15.24Zm-225.62-.68a15.24 15.24 0 1 1-15.25-15.25h.45a15.25 15.25 0 0 1 14.75 15.25Zm-88.1-178.49 32.83-14.6a13.88 13.88 0 0 0 
7.06-18.33L102.69 186h26.56v119.73h-53.6a187.65 187.65 0 0 1-6.08-71.58Zm-11.26-36.06a15.24 15.24 0 0 1 15.23-15.25H74a15.24 15.24 0 1 1-15.67 15.24Zm155.16 24.49.05-35.32h63.26c3.28 0 23.07 3.77 23.07 18.62 0 12.29-15.19 16.7-27.68 16.7ZM399 306.71c-9.8 1.13-20.63-4.12-22-10.09-5.78-32.49-15.39-39.4-30.57-51.4 18.86-11.95 38.46-29.64 38.46-53.26 0-25.52-17.49-41.59-29.4-49.48-16.76-11-35.28-13.23-40.27-13.23h-198.9a187.49 187.49 0 0 1 104.89-59.19l23.47 24.6a13.82 13.82 0 0 0 19.6.44l26.26-25a187.51 187.51 0 0 1 128.37 91.43l-18 40.57a14 14 0 0 0 7.09 18.33l34.59 15.33a187.12 187.12 0 0 1 .4 32.54h-19.28c-1.91 0-2.69 1.27-2.69 3.13v8.82C421 301 409.31 305.58 399 306.71ZM240 60.21A15.24 15.24 0 0 1 255.21 45h.45A15.24 15.24 0 1 1 240 60.21ZM436.84 214a15.24 15.24 0 1 1 0-30.48h.44a15.24 15.24 0 0 1-.44 30.48Z'/%3E%3C/svg%3E"); +} + +.md-typeset .admonition.rust, +.md-typeset details.rust { + border-color: rgb(205, 121, 44); +} + +.md-typeset .rust>.admonition-title, +.md-typeset .rust>summary { + background-color: rgb(205, 121, 44, .1); +} + +.md-typeset .rust>.admonition-title::before, +.md-typeset .rust>summary::before { + background-color: rgb(205, 121, 44); + -webkit-mask-image: var(--md-admonition-icon--rust); + mask-image: var(--md-admonition-icon--rust); +} + +/* Adapt Excalidraw diagrams to dark mode. 
*/ +body[data-md-color-scheme="slate"] .excalidraw svg { + will-change: filter; + filter: invert(100%) hue-rotate(180deg); +} diff --git a/docs/source/_build/js/mathjax.js b/docs/source/_build/js/mathjax.js new file mode 100644 index 000000000000..5b34852b5eee --- /dev/null +++ b/docs/source/_build/js/mathjax.js @@ -0,0 +1,19 @@ +window.MathJax = { + tex: { + inlineMath: [["\\(", "\\)"]], + displayMath: [["\\[", "\\]"]], + processEscapes: true, + processEnvironments: true + }, + options: { + ignoreHtmlClass: ".*|", + processHtmlClass: "arithmatex" + } +}; + +document$.subscribe(() => { + MathJax.startup.output.clearCache() + MathJax.typesetClear() + MathJax.texReset() + MathJax.typesetPromise() +}) diff --git a/docs/source/src/python/user-guide/concepts/contexts.py b/docs/source/src/python/user-guide/concepts/contexts.py deleted file mode 100644 index 7c6c2b4999fb..000000000000 --- a/docs/source/src/python/user-guide/concepts/contexts.py +++ /dev/null @@ -1,55 +0,0 @@ -# --8<-- [start:setup] -import polars as pl -import numpy as np - -np.random.seed(12) -# --8<-- [end:setup] - -# --8<-- [start:dataframe] -df = pl.DataFrame( - { - "nrs": [1, 2, 3, None, 5], - "names": ["foo", "ham", "spam", "egg", None], - "random": np.random.rand(5), - "groups": ["A", "A", "B", "C", "B"], - } -) -print(df) -# --8<-- [end:dataframe] - -# --8<-- [start:select] - -out = df.select( - pl.sum("nrs"), - pl.col("names").sort(), - pl.col("names").first().alias("first name"), - (pl.mean("nrs") * 10).alias("10xnrs"), -) -print(out) -# --8<-- [end:select] - -# --8<-- [start:filter] -out = df.filter(pl.col("nrs") > 2) -print(out) -# --8<-- [end:filter] - -# --8<-- [start:with_columns] - -df = df.with_columns( - pl.sum("nrs").alias("nrs_sum"), - pl.col("random").count().alias("count"), -) -print(df) -# --8<-- [end:with_columns] - - -# --8<-- [start:group_by] -out = df.group_by("groups").agg( - pl.sum("nrs"), # sum nrs by groups - pl.col("random").count().alias("count"), # count group members - # 
sum random where name != null - pl.col("random").filter(pl.col("names").is_not_null()).sum().name.suffix("_sum"), - pl.col("names").reverse().alias("reversed names"), -) -print(out) -# --8<-- [end:group_by] diff --git a/docs/source/src/python/user-guide/concepts/data-types/categoricals.py b/docs/source/src/python/user-guide/concepts/data-types/categoricals.py deleted file mode 100644 index c37a70be3e9d..000000000000 --- a/docs/source/src/python/user-guide/concepts/data-types/categoricals.py +++ /dev/null @@ -1,107 +0,0 @@ -# --8<-- [start:setup] -import polars as pl - -# --8<-- [end:setup] - -# --8<-- [start:example] -enum_dtype = pl.Enum(["Polar", "Panda", "Brown"]) -enum_series = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=enum_dtype) -cat_series = pl.Series( - ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical -) -# --8<-- [end:example] - - -# --8<-- [start:append] -cat_series = pl.Series( - ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical -) -cat2_series = pl.Series( - ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical -) -# Triggers a CategoricalRemappingWarning: Local categoricals have different encodings, expensive re-encoding is done -print(cat_series.append(cat2_series)) -# --8<-- [end:append] - - -# --8<-- [start:global_append] -with pl.StringCache(): - cat_series = pl.Series( - ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical - ) - cat2_series = pl.Series( - ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical - ) - print(cat_series.append(cat2_series)) -# --8<-- [end:global_append] - - -# --8<-- [start:enum_append] -dtype = pl.Enum(["Polar", "Panda", "Brown"]) -cat_series = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=dtype) -cat2_series = pl.Series(["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=dtype) -print(cat_series.append(cat2_series)) -# --8<-- [end:enum_append] - -# --8<-- [start:enum_error] -dtype = pl.Enum(["Polar", 
"Panda", "Brown"]) -try: - cat_series = pl.Series(["Polar", "Panda", "Brown", "Black"], dtype=dtype) -except Exception as e: - print(e) -# --8<-- [end:enum_error] - -# --8<-- [start:equality] -dtype = pl.Enum(["Polar", "Panda", "Brown"]) -cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=dtype) -cat_series2 = pl.Series(["Polar", "Panda", "Brown"], dtype=dtype) -print(cat_series == cat_series2) -# --8<-- [end:equality] - -# --8<-- [start:global_equality] -with pl.StringCache(): - cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) - cat_series2 = pl.Series(["Polar", "Panda", "Black"], dtype=pl.Categorical) - print(cat_series == cat_series2) -# --8<-- [end:global_equality] - -# --8<-- [start:equality] -dtype = pl.Enum(["Polar", "Panda", "Brown"]) -cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=dtype) -cat_series2 = pl.Series(["Polar", "Panda", "Brown"], dtype=dtype) -print(cat_series == cat_series2) -# --8<-- [end:equality] - -# --8<-- [start:str_compare_single] -cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) -print(cat_series <= "Cat") -# --8<-- [end:str_compare_single] - -# --8<-- [start:str_compare] -cat_series = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) -cat_series_utf = pl.Series(["Panda", "Panda", "Polar"]) -print(cat_series <= cat_series_utf) -# --8<-- [end:str_compare] - -# --8<-- [start:str_enum_compare_error] -try: - cat_series = pl.Series( - ["Low", "Medium", "High"], dtype=pl.Enum(["Low", "Medium", "High"]) - ) - cat_series <= "Excellent" -except Exception as e: - print(e) -# --8<-- [end:str_enum_compare_error] - -# --8<-- [start:str_enum_compare_single] -dtype = pl.Enum(["Low", "Medium", "High"]) -cat_series = pl.Series(["Low", "Medium", "High"], dtype=dtype) -print(cat_series <= "Medium") -# --8<-- [end:str_enum_compare_single] - -# --8<-- [start:str_enum_compare] -dtype = pl.Enum(["Low", "Medium", "High"]) -cat_series = pl.Series(["Low", "Medium", "High"], 
dtype=dtype) -cat_series2 = pl.Series(["High", "High", "Low"], dtype=dtype) -print(cat_series <= cat_series2) -# --8<-- [end:str_enum_compare] diff --git a/docs/source/src/python/user-guide/expressions/aggregation.py b/docs/source/src/python/user-guide/expressions/aggregation.py index f67226fdc3d7..d3d9ee58e88b 100644 --- a/docs/source/src/python/user-guide/expressions/aggregation.py +++ b/docs/source/src/python/user-guide/expressions/aggregation.py @@ -1,9 +1,6 @@ -# --8<-- [start:setup] +# --8<-- [start:dataframe] import polars as pl -# --8<-- [end:setup] - -# --8<-- [start:dataframe] url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv" schema_overrides = { @@ -26,7 +23,7 @@ .agg( pl.len(), pl.col("gender"), - pl.first("last_name"), + pl.first("last_name"), # Short for `pl.col("last_name").first()` ) .sort("len", descending=True) .limit(5) @@ -56,7 +53,7 @@ q = ( dataset.lazy() .group_by("state", "party") - .agg(pl.count("party").alias("count")) + .agg(pl.len().alias("count")) .filter( (pl.col("party") == "Anti-Administration") | (pl.col("party") == "Pro-Administration") @@ -104,8 +101,26 @@ def avg_birthday(gender: str) -> pl.Expr: # --8<-- [end:filter] +# --8<-- [start:filter-nested] +q = ( + dataset.lazy() + .group_by("state", "gender") + .agg( + # The function `avg_birthday` is not needed: + compute_age().mean().alias("avg birthday"), + pl.len().alias("#"), + ) + .sort("#", descending=True) + .limit(5) +) + +df = q.collect() +print(df) +# --8<-- [end:filter-nested] + + # --8<-- [start:sort] -def get_person() -> pl.Expr: +def get_name() -> pl.Expr: return pl.col("first_name") + pl.lit(" ") + pl.col("last_name") @@ -114,8 +129,8 @@ def get_person() -> pl.Expr: .sort("birthday", descending=True) .group_by("state") .agg( - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), + get_name().first().alias("youngest"), + get_name().last().alias("oldest"), ) .limit(5) ) @@ -126,7 +141,7 @@ def get_person() -> 
pl.Expr: # --8<-- [start:sort2] -def get_person() -> pl.Expr: +def get_name() -> pl.Expr: return pl.col("first_name") + pl.lit(" ") + pl.col("last_name") @@ -135,9 +150,9 @@ def get_person() -> pl.Expr: .sort("birthday", descending=True) .group_by("state") .agg( - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), - get_person().sort().first().alias("alphabetical_first"), + get_name().first().alias("youngest"), + get_name().last().alias("oldest"), + get_name().sort().first().alias("alphabetical_first"), ) .limit(5) ) @@ -148,7 +163,7 @@ def get_person() -> pl.Expr: # --8<-- [start:sort3] -def get_person() -> pl.Expr: +def get_name() -> pl.Expr: return pl.col("first_name") + pl.lit(" ") + pl.col("last_name") @@ -157,10 +172,10 @@ def get_person() -> pl.Expr: .sort("birthday", descending=True) .group_by("state") .agg( - get_person().first().alias("youngest"), - get_person().last().alias("oldest"), - get_person().sort().first().alias("alphabetical_first"), - pl.col("gender").sort_by(get_person()).first(), + get_name().first().alias("youngest"), + get_name().last().alias("oldest"), + get_name().sort().first().alias("alphabetical_first"), + pl.col("gender").sort_by(get_name()).first(), ) .sort("state") .limit(5) diff --git a/docs/source/src/python/user-guide/expressions/casting.py b/docs/source/src/python/user-guide/expressions/casting.py index bd06f4038843..1cc5477e8364 100644 --- a/docs/source/src/python/user-guide/expressions/casting.py +++ b/docs/source/src/python/user-guide/expressions/casting.py @@ -1,16 +1,11 @@ -# --8<-- [start:setup] - +# --8<-- [start:dfnum] import polars as pl -# --8<-- [end:setup] - -# --8<-- [start:dfnum] df = pl.DataFrame( { - "integers": [1, 2, 3, 4, 5], - "big_integers": [1, 10000002, 3, 10000004, 10000005], - "floats": [4.0, 5.0, 6.0, 7.0, 8.0], - "floats_with_decimal": [4.532, 5.5, 6.5, 7.5, 8.5], + "integers": [1, 2, 3], + "big_integers": [10000002, 2, 30000003], + "floats": [4.0, 5.8, -6.3], } ) @@ -21,28 
+16,28 @@ out = df.select( pl.col("integers").cast(pl.Float32).alias("integers_as_floats"), pl.col("floats").cast(pl.Int32).alias("floats_as_integers"), - pl.col("floats_with_decimal") - .cast(pl.Int32) - .alias("floats_with_decimal_as_integers"), ) print(out) # --8<-- [end:castnum] # --8<-- [start:downcast] -out = df.select( - pl.col("integers").cast(pl.Int16).alias("integers_smallfootprint"), - pl.col("floats").cast(pl.Float32).alias("floats_smallfootprint"), +print(f"Before downcasting: {df.estimated_size()} bytes") +out = df.with_columns( + pl.col("integers").cast(pl.Int16), + pl.col("floats").cast(pl.Float32), ) -print(out) +print(f"After downcasting: {out.estimated_size()} bytes") # --8<-- [end:downcast] # --8<-- [start:overflow] +import polars.exceptions as plexc + try: out = df.select(pl.col("big_integers").cast(pl.Int8)) print(out) -except Exception as e: - print(e) +except plexc.InvalidOperationError as err: + print("InvalidOperationError:", err) # --8<-- [end:overflow] # --8<-- [start:overflow2] @@ -54,28 +49,31 @@ # --8<-- [start:strings] df = pl.DataFrame( { - "integers": [1, 2, 3, 4, 5], - "float": [4.0, 5.03, 6.0, 7.0, 8.0], - "floats_as_string": ["4.0", "5.0", "6.0", "7.0", "8.0"], + "integers_as_strings": ["1", "2", "3"], + "floats_as_strings": ["4.0", "5.8", "-6.3"], + "floats": [4.0, 5.8, -6.3], } ) out = df.select( - pl.col("integers").cast(pl.String), - pl.col("float").cast(pl.String), - pl.col("floats_as_string").cast(pl.Float64), + pl.col("integers_as_strings").cast(pl.Int32), + pl.col("floats_as_strings").cast(pl.Float64), + pl.col("floats").cast(pl.String), ) print(out) # --8<-- [end:strings] # --8<-- [start:strings2] -df = pl.DataFrame({"strings_not_float": ["4.0", "not_a_number", "6.0", "7.0", "8.0"]}) +df = pl.DataFrame( + { + "floats": ["4.0", "5.8", "- 6 . 
3"], + } +) try: - out = df.select(pl.col("strings_not_float").cast(pl.Float64)) - print(out) -except Exception as e: - print(e) + out = df.select(pl.col("floats").cast(pl.Float64)) +except plexc.InvalidOperationError as err: + print("InvalidOperationError:", err) # --8<-- [end:strings2] # --8<-- [start:bool] @@ -87,37 +85,47 @@ } ) -out = df.select(pl.col("integers").cast(pl.Boolean), pl.col("floats").cast(pl.Boolean)) +out = df.select( + pl.col("integers").cast(pl.Boolean), + pl.col("floats").cast(pl.Boolean), + pl.col("bools").cast(pl.UInt8), +) print(out) # --8<-- [end:bool] # --8<-- [start:dates] -from datetime import date, datetime +from datetime import date, datetime, time df = pl.DataFrame( { - "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True), - "datetime": pl.datetime_range( - datetime(2022, 1, 1), datetime(2022, 1, 5), eager=True - ), + "date": [ + date(1970, 1, 1), # epoch + date(1970, 1, 10), # 9 days later + ], + "datetime": [ + datetime(1970, 1, 1, 0, 0, 0), # epoch + datetime(1970, 1, 1, 0, 0, 0, 500), # 500 us later + ], + "time": [ + time(0, 0, 0), # reference time + time(0, 0, 1), # 1 second later + ], } ) -out = df.select(pl.col("date").cast(pl.Int64), pl.col("datetime").cast(pl.Int64)) +out = df.select( + pl.col("date").cast(pl.Int64).alias("days_since_epoch"), + pl.col("datetime").cast(pl.Int64).alias("us_since_epoch"), + pl.col("time").cast(pl.Int64).alias("ns_since_midnight"), +) print(out) # --8<-- [end:dates] # --8<-- [start:dates2] df = pl.DataFrame( { - "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True), - "string": [ - "2022-01-01", - "2022-01-02", - "2022-01-03", - "2022-01-04", - "2022-01-05", - ], + "date": [date(2022, 1, 1), date(2022, 1, 2)], + "string": ["2022-01-01", "2022-01-02"], } ) diff --git a/docs/source/src/python/user-guide/expressions/categoricals.py b/docs/source/src/python/user-guide/expressions/categoricals.py new file mode 100644 index 000000000000..b857a2386092 --- /dev/null 
+++ b/docs/source/src/python/user-guide/expressions/categoricals.py @@ -0,0 +1,206 @@ +# --8<-- [start:enum-example] +import polars as pl + +bears_enum = pl.Enum(["Polar", "Panda", "Brown"]) +bears = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=bears_enum) +print(bears) +# --8<-- [end:enum-example] + +# --8<-- [start:enum-wrong-value] +import polars.exceptions as plexc + +try: + bears_kind_of = pl.Series( + ["Polar", "Panda", "Brown", "Polar", "Shark"], + dtype=bears_enum, + ) +except plexc.InvalidOperationError as exc: + print("InvalidOperationError:", exc) +# --8<-- [end:enum-wrong-value] + +# --8<-- [start:log-levels] +log_levels = pl.Enum(["debug", "info", "warning", "error"]) + +logs = pl.DataFrame( + { + "level": ["debug", "info", "debug", "error"], + "message": [ + "process id: 525", + "Service started correctly", + "startup time: 67ms", + "Cannot connect to DB!", + ], + }, + schema_overrides={ + "level": log_levels, + }, +) + +non_debug_logs = logs.filter( + pl.col("level") > "debug", +) +print(non_debug_logs) +# --8<-- [end:log-levels] + +# --8<-- [start:categorical-example] +bears_cat = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical +) +print(bears_cat) +# --8<-- [end:categorical-example] + +# --8<-- [start:categorical-comparison-string] +print(bears_cat < "Cat") +# --8<-- [end:categorical-comparison-string] + +# --8<-- [start:categorical-comparison-string-column] +bears_str = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], +) +print(bears_cat == bears_str) +# --8<-- [end:categorical-comparison-string-column] + +# --8<-- [start:categorical-comparison-categorical-column] +bears_cat2 = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], + dtype=pl.Categorical, +) + +try: + print(bears_cat == bears_cat2) +except plexc.StringCacheMismatchError as exc: + exc_str = str(exc).splitlines()[0] + print("StringCacheMismatchError:", exc_str) +# --8<-- 
[end:categorical-comparison-categorical-column] + +# --8<-- [start:stringcache-categorical-equality] +with pl.StringCache(): + bears_cat = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical + ) + bears_cat2 = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical + ) + +print(bears_cat == bears_cat2) +# --8<-- [end:stringcache-categorical-equality] + +# --8<-- [start:stringcache-categorical-comparison-lexical] +with pl.StringCache(): + bears_cat = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], + dtype=pl.Categorical(ordering="lexical"), + ) + bears_cat2 = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical + ) + +print(bears_cat > bears_cat2) +# --8<-- [end:stringcache-categorical-comparison-lexical] + +# --8<-- [start:stringcache-categorical-comparison-physical] +with pl.StringCache(): + bears_cat = pl.Series( + # Polar < Panda < Brown + ["Polar", "Panda", "Brown", "Brown", "Polar"], + dtype=pl.Categorical, + ) + bears_cat2 = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical + ) + +print(bears_cat > bears_cat2) +# --8<-- [end:stringcache-categorical-comparison-physical] + +# --8<-- [start:concatenating-categoricals] +male_bears = pl.DataFrame( + { + "species": ["Polar", "Brown", "Panda"], + "weight": [450, 500, 110], # kg + }, + schema_overrides={"species": pl.Categorical}, +) +female_bears = pl.DataFrame( + { + "species": ["Brown", "Polar", "Panda"], + "weight": [340, 200, 90], # kg + }, + schema_overrides={"species": pl.Categorical}, +) + +bears = pl.concat([male_bears, female_bears], how="vertical") +print(bears) +# --8<-- [end:concatenating-categoricals] + + +# --8<-- [start:example] +import polars as pl + +bears_enum = pl.Enum(["Polar", "Panda", "Brown"]) +bears = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=bears_enum) +print(bears) + +cat_bears = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], 
dtype=pl.Categorical +) +# --8<-- [end:example] + + +# --8<-- [start:append] +cat_bears = pl.Series( + ["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=pl.Categorical +) +cat2_series = pl.Series( + ["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=pl.Categorical +) + +# Triggers a CategoricalRemappingWarning. +print(cat_bears.append(cat2_series)) +# --8<-- [end:append] + +# --8<-- [start:enum_append] +dtype = pl.Enum(["Polar", "Panda", "Brown"]) +cat_bears = pl.Series(["Polar", "Panda", "Brown", "Brown", "Polar"], dtype=dtype) +cat2_series = pl.Series(["Panda", "Brown", "Brown", "Polar", "Polar"], dtype=dtype) +print(cat_bears.append(cat2_series)) +# --8<-- [end:enum_append] + +# --8<-- [start:enum_error] +dtype = pl.Enum(["Polar", "Panda", "Brown"]) +try: + cat_bears = pl.Series(["Polar", "Panda", "Brown", "Black"], dtype=dtype) +except Exception as e: + print(e) +# --8<-- [end:enum_error] + +# --8<-- [start:equality] +dtype = pl.Enum(["Polar", "Panda", "Brown"]) +cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=dtype) +cat_series2 = pl.Series(["Polar", "Panda", "Brown"], dtype=dtype) +print(cat_bears == cat_series2) +# --8<-- [end:equality] + +# --8<-- [start:global_equality] +with pl.StringCache(): + cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) + cat_series2 = pl.Series(["Polar", "Panda", "Black"], dtype=pl.Categorical) + print(cat_bears == cat_series2) +# --8<-- [end:global_equality] + +# --8<-- [start:equality] +dtype = pl.Enum(["Polar", "Panda", "Brown"]) +cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=dtype) +cat_series2 = pl.Series(["Polar", "Panda", "Brown"], dtype=dtype) +print(cat_bears == cat_series2) +# --8<-- [end:equality] + +# --8<-- [start:str_compare_single] +cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) +print(cat_bears <= "Cat") +# --8<-- [end:str_compare_single] + +# --8<-- [start:str_compare] +cat_bears = pl.Series(["Brown", "Panda", "Polar"], dtype=pl.Categorical) 
+cat_series_utf = pl.Series(["Panda", "Panda", "Polar"]) +print(cat_bears <= cat_series_utf) +# --8<-- [end:str_compare] diff --git a/docs/source/src/python/user-guide/expressions/column-selections.py b/docs/source/src/python/user-guide/expressions/column-selections.py index 4454a1a3d970..61f3fdb44a09 100644 --- a/docs/source/src/python/user-guide/expressions/column-selections.py +++ b/docs/source/src/python/user-guide/expressions/column-selections.py @@ -1,5 +1,3 @@ -# --8<-- [start:setup] -# --8<-- [end:setup] # --8<-- [start:selectors_df] from datetime import date, datetime diff --git a/docs/source/src/python/user-guide/expressions/expression-expansion.py b/docs/source/src/python/user-guide/expressions/expression-expansion.py new file mode 100644 index 000000000000..9df68540f6ac --- /dev/null +++ b/docs/source/src/python/user-guide/expressions/expression-expansion.py @@ -0,0 +1,197 @@ +# --8<-- [start:df] +import polars as pl + +df = pl.DataFrame( + { # As of 14th October 2024, ~3pm UTC + "ticker": ["AAPL", "NVDA", "MSFT", "GOOG", "AMZN"], + "company_name": ["Apple", "NVIDIA", "Microsoft", "Alphabet (Google)", "Amazon"], + "price": [229.9, 138.93, 420.56, 166.41, 188.4], + "day_high": [231.31, 139.6, 424.04, 167.62, 189.83], + "day_low": [228.6, 136.3, 417.52, 164.78, 188.44], + "year_high": [237.23, 140.76, 468.35, 193.31, 201.2], + "year_low": [164.08, 39.23, 324.39, 121.46, 118.35], + } +) + +print(df) +# --8<-- [end:df] + +# --8<-- [start:col-with-names] +eur_usd_rate = 1.09 # As of 14th October 2024 + +result = df.with_columns( + ( + pl.col( + "price", + "day_high", + "day_low", + "year_high", + "year_low", + ) + / eur_usd_rate + ).round(2) +) +print(result) +# --8<-- [end:col-with-names] + +# --8<-- [start:expression-list] +exprs = [ + (pl.col("price") / eur_usd_rate).round(2), + (pl.col("day_high") / eur_usd_rate).round(2), + (pl.col("day_low") / eur_usd_rate).round(2), + (pl.col("year_high") / eur_usd_rate).round(2), + (pl.col("year_low") / 
eur_usd_rate).round(2), +] + +result2 = df.with_columns(exprs) +print(result.equals(result2)) +# --8<-- [end:expression-list] + +# --8<-- [start:col-with-dtype] +result = df.with_columns((pl.col(pl.Float64) / eur_usd_rate).round(2)) +print(result) +# --8<-- [end:col-with-dtype] + +# --8<-- [start:col-with-dtypes] +result2 = df.with_columns( + ( + pl.col( + pl.Float32, + pl.Float64, + ) + / eur_usd_rate + ).round(2) +) +print(result.equals(result2)) +# --8<-- [end:col-with-dtypes] + +# --8<-- [start:col-with-regex] +result = df.select(pl.col("ticker", "^.*_high$", "^.*_low$")) +print(result) +# --8<-- [end:col-with-regex] + +# --8<-- [start:col-error] +try: + df.select(pl.col("ticker", pl.Float64)) +except TypeError as err: + print("TypeError:", err) +# --8<-- [end:col-error] + +# --8<-- [start:all] +result = df.select(pl.all()) +print(result.equals(df)) +# --8<-- [end:all] + +# --8<-- [start:all-exclude] +result = df.select(pl.all().exclude("^day_.*$")) +print(result) +# --8<-- [end:all-exclude] + +# --8<-- [start:col-exclude] +result = df.select(pl.col(pl.Float64).exclude("^day_.*$")) +print(result) +# --8<-- [end:col-exclude] + +# --8<-- [start:duplicate-error] +import polars.exceptions as plexc + +gbp_usd_rate = 1.31 # As of 14th October 2024 + +try: + df.select( + pl.col("price") / gbp_usd_rate, # This would be named "price"... + pl.col("price") / eur_usd_rate, # And so would this. 
+ ) +except plexc.DuplicateError as err: + print("DuplicateError:", err) +# --8<-- [end:duplicate-error] + +# --8<-- [start:alias] +df.select( + (pl.col("price") / gbp_usd_rate).alias("price (GBP)"), + (pl.col("price") / eur_usd_rate).alias("price (EUR)"), +) +# --8<-- [end:alias] + +# --8<-- [start:prefix-suffix] +result = df.select( + (pl.col("^year_.*$") / eur_usd_rate).name.prefix("in_eur_"), + (pl.col("day_high", "day_low") / gbp_usd_rate).name.suffix("_gbp"), +) +print(result) +# --8<-- [end:prefix-suffix] + +# --8<-- [start:name-map] +result = df.select(pl.all().name.map(str.capitalize)) +print(result) +# --8<-- [end:name-map] + +# --8<-- [start:for-with_columns] +result = df +for tp in ["day", "year"]: + result = result.with_columns( + (pl.col(f"{tp}_high") - pl.col(f"{tp}_low")).alias(f"{tp}_amplitude") + ) +print(result) +# --8<-- [end:for-with_columns] + + +# --8<-- [start:yield-expressions] +def amplitude_expressions(time_periods): + for tp in time_periods: + yield (pl.col(f"{tp}_high") - pl.col(f"{tp}_low")).alias(f"{tp}_amplitude") + + +result = df.with_columns(amplitude_expressions(["day", "year"])) +print(result) +# --8<-- [end:yield-expressions] + +# --8<-- [start:selectors] +import polars.selectors as cs + +result = df.select(cs.string() | cs.ends_with("_high")) +print(result) +# --8<-- [end:selectors] + +# --8<-- [start:selectors-set-operations] +result = df.select(cs.contains("_") - cs.string()) +print(result) +# --8<-- [end:selectors-set-operations] + +# --8<-- [start:selectors-expressions] +result = df.select((cs.contains("_") - cs.string()) / eur_usd_rate) +print(result) +# --8<-- [end:selectors-expressions] + +# --8<-- [start:selector-ambiguity] +people = pl.DataFrame( + { + "name": ["Anna", "Bob"], + "has_partner": [True, False], + "has_kids": [False, False], + "has_tattoos": [True, False], + "is_alive": [True, True], + } +) + +wrong_result = people.select((~cs.starts_with("has_")).name.prefix("not_")) +print(wrong_result) +# --8<-- 
[end:selector-ambiguity] + +# --8<-- [start:as_expr] +result = people.select((~cs.starts_with("has_").as_expr()).name.prefix("not_")) +print(result) +# --8<-- [end:as_expr] + +# --8<-- [start:is_selector] +print(cs.is_selector(~cs.starts_with("has_").as_expr())) +# --8<-- [end:is_selector] + +# --8<-- [start:expand_selector] +print( + cs.expand_selector( + people, + cs.starts_with("has_"), + ) +) +# --8<-- [end:expand_selector] diff --git a/docs/source/src/python/user-guide/expressions/folds.py b/docs/source/src/python/user-guide/expressions/folds.py index 803591b5b581..f0be44b29cb5 100644 --- a/docs/source/src/python/user-guide/expressions/folds.py +++ b/docs/source/src/python/user-guide/expressions/folds.py @@ -1,24 +1,63 @@ -# --8<-- [start:setup] +# --8<-- [start:mansum] +import operator import polars as pl -# --8<-- [end:setup] - -# --8<-- [start:mansum] df = pl.DataFrame( { + "label": ["foo", "bar", "spam"], "a": [1, 2, 3], "b": [10, 20, 30], } ) -out = df.select( - pl.fold(acc=pl.lit(0), function=lambda acc, x: acc + x, exprs=pl.all()).alias( - "sum" - ), +result = df.select( + pl.fold( + acc=pl.lit(0), + function=operator.add, + exprs=pl.col("a", "b"), + ).alias("sum_fold"), + pl.sum_horizontal(pl.col("a", "b")).alias("sum_horz"), ) -print(out) + +print(result) # --8<-- [end:mansum] +# --8<-- [start:mansum-explicit] +acc = pl.lit(0) +f = operator.add + +result = df.select( + f(f(acc, pl.col("a")), pl.col("b")), + pl.fold(acc=acc, function=f, exprs=pl.col("a", "b")).alias("sum_fold"), +) + +print(result) +# --8<-- [end:mansum-explicit] + +# --8<-- [start:manprod] +result = df.select( + pl.fold( + acc=pl.lit(0), + function=operator.mul, + exprs=pl.col("a", "b"), + ).alias("prod"), +) + +print(result) +# --8<-- [end:manprod] + +# --8<-- [start:manprod-fixed] +result = df.select( + pl.fold( + acc=pl.lit(1), + function=operator.mul, + exprs=pl.col("a", "b"), + ).alias("prod"), +) + +print(result) +# --8<-- [end:manprod-fixed] + # --8<-- [start:conditional] df = 
pl.DataFrame( { @@ -27,14 +66,14 @@ } ) -out = df.filter( +result = df.filter( pl.fold( acc=pl.lit(True), function=lambda acc, x: acc & x, - exprs=pl.col("*") > 1, + exprs=pl.all() > 1, ) ) -print(out) +print(result) # --8<-- [end:conditional] # --8<-- [start:string] @@ -45,6 +84,6 @@ } ) -out = df.select(pl.concat_str(["a", "b"])) -print(out) +result = df.select(pl.concat_str(["a", "b"])) +print(result) # --8<-- [end:string] diff --git a/docs/source/src/python/user-guide/expressions/functions.py b/docs/source/src/python/user-guide/expressions/functions.py deleted file mode 100644 index 5f9bbd5bb1da..000000000000 --- a/docs/source/src/python/user-guide/expressions/functions.py +++ /dev/null @@ -1,60 +0,0 @@ -# --8<-- [start:setup] - -import polars as pl -import numpy as np - -np.random.seed(12) -# --8<-- [end:setup] - -# --8<-- [start:dataframe] -df = pl.DataFrame( - { - "nrs": [1, 2, 3, None, 5], - "names": ["foo", "ham", "spam", "egg", "spam"], - "random": np.random.rand(5), - "groups": ["A", "A", "B", "C", "B"], - } -) -print(df) -# --8<-- [end:dataframe] - -# --8<-- [start:samename] -df_samename = df.select(pl.col("nrs") + 5) -print(df_samename) -# --8<-- [end:samename] - - -# --8<-- [start:samenametwice] -try: - df_samename2 = df.select(pl.col("nrs") + 5, pl.col("nrs") - 5) - print(df_samename2) -except Exception as e: - print(e) -# --8<-- [end:samenametwice] - -# --8<-- [start:samenamealias] -df_alias = df.select( - (pl.col("nrs") + 5).alias("nrs + 5"), - (pl.col("nrs") - 5).alias("nrs - 5"), -) -print(df_alias) -# --8<-- [end:samenamealias] - -# --8<-- [start:countunique] -df_alias = df.select( - pl.col("names").n_unique().alias("unique"), - pl.approx_n_unique("names").alias("unique_approx"), -) -print(df_alias) -# --8<-- [end:countunique] - -# --8<-- [start:conditional] -df_conditional = df.select( - pl.col("nrs"), - pl.when(pl.col("nrs") > 2) - .then(pl.lit(True)) - .otherwise(pl.lit(False)) - .alias("conditional"), -) -print(df_conditional) -# --8<-- 
[end:conditional] diff --git a/docs/source/src/python/user-guide/expressions/lists.py b/docs/source/src/python/user-guide/expressions/lists.py index edd0092330d5..b6f56e6a6722 100644 --- a/docs/source/src/python/user-guide/expressions/lists.py +++ b/docs/source/src/python/user-guide/expressions/lists.py @@ -1,12 +1,74 @@ -# --8<-- [start:setup] +# --8<-- [start:list-example] +from datetime import datetime import polars as pl -# --8<-- [end:setup] +df = pl.DataFrame( + { + "names": [ + ["Anne", "Averill", "Adams"], + ["Brandon", "Brooke", "Borden", "Branson"], + ["Camila", "Campbell"], + ["Dennis", "Doyle"], + ], + "children_ages": [ + [5, 7], + [], + [], + [8, 11, 18], + ], + "medical_appointments": [ + [], + [], + [], + [datetime(2022, 5, 22, 16, 30)], + ], + } +) + +print(df) +# --8<-- [end:list-example] + +# --8<-- [start:array-example] +df = pl.DataFrame( + { + "bit_flags": [ + [True, True, True, True, False], + [False, True, True, True, True], + ], + "tic_tac_toe": [ + [ + [" ", "x", "o"], + [" ", "x", " "], + ["o", "x", " "], + ], + [ + ["o", "x", "x"], + [" ", "o", "x"], + [" ", " ", "o"], + ], + ], + }, + schema_overrides={ + "bit_flags": pl.Array(pl.Boolean, 5), + "tic_tac_toe": pl.Array(pl.String, (3, 3)), + }, +) + +print(df) +# --8<-- [end:array-example] -# --8<-- [start:weather_df] +# --8<-- [start:numpy-array-inference] +import numpy as np + +array = np.arange(0, 120).reshape((5, 2, 3, 4)) # 4D array + +print(pl.Series(array).dtype) # Column with the 3D subarrays +# --8<-- [end:numpy-array-inference] + +# --8<-- [start:weather] weather = pl.DataFrame( { - "station": ["Station " + str(x) for x in range(1, 6)], + "station": [f"Station {idx}" for idx in range(1, 6)], "temperatures": [ "20 5 5 E1 7 13 19 9 6 20", "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40", @@ -16,57 +78,55 @@ ], } ) -print(weather) -# --8<-- [end:weather_df] -# --8<-- [start:string_to_list] -out = weather.with_columns(pl.col("temperatures").str.split(" ")) -print(out) -# --8<-- 
[end:string_to_list] +print(weather) +# --8<-- [end:weather] -# --8<-- [start:explode_to_atomic] -out = weather.with_columns(pl.col("temperatures").str.split(" ")).explode( - "temperatures" -) -print(out) -# --8<-- [end:explode_to_atomic] - -# --8<-- [start:list_ops] -out = weather.with_columns(pl.col("temperatures").str.split(" ")).with_columns( - pl.col("temperatures").list.head(3).alias("top3"), - pl.col("temperatures").list.slice(-3, 3).alias("bottom_3"), - pl.col("temperatures").list.len().alias("obs"), +# --8<-- [start:split] +weather = weather.with_columns( + pl.col("temperatures").str.split(" "), ) -print(out) -# --8<-- [end:list_ops] +print(weather) +# --8<-- [end:split] +# --8<-- [start:explode] +result = weather.explode("temperatures") +print(result) +# --8<-- [end:explode] -# --8<-- [start:count_errors] -out = weather.with_columns( +# --8<-- [start:list-slicing] +result = weather.with_columns( + pl.col("temperatures").list.head(3).alias("head"), + pl.col("temperatures").list.tail(3).alias("tail"), + pl.col("temperatures").list.slice(-3, 2).alias("two_next_to_last"), +) +print(result) +# --8<-- [end:list-slicing] + +# --8<-- [start:element-wise-casting] +result = weather.with_columns( pl.col("temperatures") - .str.split(" ") .list.eval(pl.element().cast(pl.Int64, strict=False).is_null()) .list.sum() - .alias("errors") + .alias("errors"), ) -print(out) -# --8<-- [end:count_errors] +print(result) +# --8<-- [end:element-wise-casting] -# --8<-- [start:count_errors_regex] -out = weather.with_columns( +# --8<-- [start:element-wise-regex] +result2 = weather.with_columns( pl.col("temperatures") - .str.split(" ") .list.eval(pl.element().str.contains("(?i)[a-z]")) .list.sum() - .alias("errors") + .alias("errors"), ) -print(out) -# --8<-- [end:count_errors_regex] +print(result.equals(result2)) +# --8<-- [end:element-wise-regex] # --8<-- [start:weather_by_day] weather_by_day = pl.DataFrame( { - "station": ["Station " + str(x) for x in range(1, 11)], + "station": 
[f"Station {idx}" for idx in range(1, 11)], "day_1": [17, 11, 8, 22, 9, 21, 20, 8, 8, 17], "day_2": [15, 11, 10, 8, 7, 14, 18, 21, 15, 13], "day_3": [16, 15, 24, 24, 8, 23, 19, 23, 16, 10], @@ -75,10 +135,10 @@ print(weather_by_day) # --8<-- [end:weather_by_day] -# --8<-- [start:weather_by_day_rank] -rank_pct = (pl.element().rank(descending=True) / pl.col("*").count()).round(2) +# --8<-- [start:rank_pct] +rank_pct = (pl.element().rank(descending=True) / pl.all().count()).round(2) -out = weather_by_day.with_columns( +result = weather_by_day.with_columns( # create the list of homogeneous data pl.concat_list(pl.all().exclude("station")).alias("all_temps") ).select( @@ -88,27 +148,37 @@ pl.col("all_temps").list.eval(rank_pct, parallel=True).alias("temps_rank"), ) -print(out) -# --8<-- [end:weather_by_day_rank] - -# --8<-- [start:array_df] -array_df = pl.DataFrame( - [ - pl.Series("Array_1", [[1, 3], [2, 5]]), - pl.Series("Array_2", [[1, 7, 3], [8, 1, 0]]), - ], - schema={ - "Array_1": pl.Array(pl.Int64, 2), - "Array_2": pl.Array(pl.Int64, 3), +print(result) +# --8<-- [end:rank_pct] + +# --8<-- [start:array-overview] +df = pl.DataFrame( + { + "first_last": [ + ["Anne", "Adams"], + ["Brandon", "Branson"], + ["Camila", "Campbell"], + ["Dennis", "Doyle"], + ], + "fav_numbers": [ + [42, 0, 1], + [2, 3, 5], + [13, 21, 34], + [73, 3, 7], + ], + }, + schema_overrides={ + "first_last": pl.Array(pl.String, 2), + "fav_numbers": pl.Array(pl.Int32, 3), }, ) -print(array_df) -# --8<-- [end:array_df] -# --8<-- [start:array_ops] -out = array_df.select( - pl.col("Array_1").arr.min().name.suffix("_min"), - pl.col("Array_2").arr.sum().name.suffix("_sum"), +result = df.select( + pl.col("first_last").arr.join(" ").alias("name"), + pl.col("fav_numbers").arr.sort(), + pl.col("fav_numbers").arr.max().alias("largest_fav"), + pl.col("fav_numbers").arr.sum().alias("summed"), + pl.col("fav_numbers").arr.contains(3).alias("likes_3"), ) -print(out) -# --8<-- [end:array_ops] +print(result) +# --8<-- 
[end:array-overview] diff --git a/docs/source/src/python/user-guide/expressions/missing-data.py b/docs/source/src/python/user-guide/expressions/missing-data.py index f8944de3c9ee..f078f5a34aa7 100644 --- a/docs/source/src/python/user-guide/expressions/missing-data.py +++ b/docs/source/src/python/user-guide/expressions/missing-data.py @@ -1,10 +1,6 @@ -# --8<-- [start:setup] -import numpy as np +# --8<-- [start:dataframe] import polars as pl -# --8<-- [end:setup] - -# --8<-- [start:dataframe] df = pl.DataFrame( { "value": [1, None], @@ -31,8 +27,8 @@ # --8<-- [start:dataframe2] df = pl.DataFrame( { - "col1": [1, 2, 3], - "col2": [1, None, 3], + "col1": [0.5, 1, 1.5, 2, 2.5], + "col2": [1, None, 3, None, 5], }, ) print(df) @@ -41,25 +37,26 @@ # --8<-- [start:fill] fill_literal_df = df.with_columns( - pl.col("col2").fill_null(pl.lit(2)), + pl.col("col2").fill_null(3), ) print(fill_literal_df) # --8<-- [end:fill] -# --8<-- [start:fillstrategy] -fill_forward_df = df.with_columns( - pl.col("col2").fill_null(strategy="forward"), -) -print(fill_forward_df) -# --8<-- [end:fillstrategy] - # --8<-- [start:fillexpr] fill_median_df = df.with_columns( - pl.col("col2").fill_null(pl.median("col2")), + pl.col("col2").fill_null((2 * pl.col("col1")).cast(pl.Int64)), ) print(fill_median_df) # --8<-- [end:fillexpr] +# --8<-- [start:fillstrategy] +fill_forward_df = df.with_columns( + pl.col("col2").fill_null(strategy="forward").alias("forward"), + pl.col("col2").fill_null(strategy="backward").alias("backward"), +) +print(fill_forward_df) +# --8<-- [end:fillstrategy] + # --8<-- [start:fillinterpolate] fill_interpolation_df = df.with_columns( pl.col("col2").interpolate(), @@ -68,6 +65,8 @@ # --8<-- [end:fillinterpolate] # --8<-- [start:nan] +import numpy as np + nan_df = pl.DataFrame( { "value": [1.0, np.nan, float("nan"), 3.0], @@ -76,9 +75,23 @@ print(nan_df) # --8<-- [end:nan] +# --8<-- [start:nan-computed] +df = pl.DataFrame( + { + "dividend": [1, 0, -1], + "divisor": [1, 0, -1], + } 
+) +result = df.select(pl.col("dividend") / pl.col("divisor")) +print(result) +# --8<-- [end:nan-computed] + # --8<-- [start:nanfill] mean_nan_df = nan_df.with_columns( - pl.col("value").fill_nan(None).alias("value"), -).mean() + pl.col("value").fill_nan(None).alias("replaced"), +).select( + pl.all().mean().name.suffix("_mean"), + pl.all().sum().name.suffix("_sum"), +) print(mean_nan_df) # --8<-- [end:nanfill] diff --git a/docs/source/src/python/user-guide/expressions/operations.py b/docs/source/src/python/user-guide/expressions/operations.py new file mode 100644 index 000000000000..822a8b9a1a5c --- /dev/null +++ b/docs/source/src/python/user-guide/expressions/operations.py @@ -0,0 +1,131 @@ +# --8<-- [start:dataframe] +import polars as pl +import numpy as np + +np.random.seed(42) # For reproducibility. + +df = pl.DataFrame( + { + "nrs": [1, 2, 3, None, 5], + "names": ["foo", "ham", "spam", "egg", "spam"], + "random": np.random.rand(5), + "groups": ["A", "A", "B", "A", "B"], + } +) +print(df) +# --8<-- [end:dataframe] + +# --8<-- [start:arithmetic] +result = df.select( + (pl.col("nrs") + 5).alias("nrs + 5"), + (pl.col("nrs") - 5).alias("nrs - 5"), + (pl.col("nrs") * pl.col("random")).alias("nrs * random"), + (pl.col("nrs") / pl.col("random")).alias("nrs / random"), + (pl.col("nrs") ** 2).alias("nrs ** 2"), + (pl.col("nrs") % 3).alias("nrs % 3"), +) + +print(result) +# --8<-- [end:arithmetic] + +# --8<-- [start:operator-overloading] +result_named_operators = df.select( + (pl.col("nrs").add(5)).alias("nrs + 5"), + (pl.col("nrs").sub(5)).alias("nrs - 5"), + (pl.col("nrs").mul(pl.col("random"))).alias("nrs * random"), + (pl.col("nrs").truediv(pl.col("random"))).alias("nrs / random"), + (pl.col("nrs").pow(2)).alias("nrs ** 2"), + (pl.col("nrs").mod(3)).alias("nrs % 3"), +) + +print(result.equals(result_named_operators)) +# --8<-- [end:operator-overloading] + +# --8<-- [start:comparison] +result = df.select( + (pl.col("nrs") > 1).alias("nrs > 1"), # .gt + (pl.col("nrs") 
>= 3).alias("nrs >= 3"), # ge + (pl.col("random") < 0.2).alias("random < .2"), # .lt + (pl.col("random") <= 0.5).alias("random <= .5"), # .le + (pl.col("nrs") != 1).alias("nrs != 1"), # .ne + (pl.col("nrs") == 1).alias("nrs == 1"), # .eq +) +print(result) +# --8<-- [end:comparison] + +# --8<-- [start:boolean] +# Boolean operators & | ~ +result = df.select( + ((~pl.col("nrs").is_null()) & (pl.col("groups") == "A")).alias( + "number not null and group A" + ), + ((pl.col("random") < 0.5) | (pl.col("groups") == "B")).alias( + "random < 0.5 or group B" + ), +) + +print(result) + +# Corresponding named functions `and_`, `or_`, and `not_`. +result2 = df.select( + (pl.col("nrs").is_null().not_().and_(pl.col("groups") == "A")).alias( + "number not null and group A" + ), + ((pl.col("random") < 0.5).or_(pl.col("groups") == "B")).alias( + "random < 0.5 or group B" + ), +) +print(result.equals(result2)) +# --8<-- [end:boolean] + +# --8<-- [start:bitwise] +result = df.select( + pl.col("nrs"), + (pl.col("nrs") & 6).alias("nrs & 6"), + (pl.col("nrs") | 6).alias("nrs | 6"), + (~pl.col("nrs")).alias("~nrs"), + (pl.col("nrs") ^ 6).alias("nrs ^ 6"), +) + +print(result) +# --8<-- [end:bitwise] + +# --8<-- [start:count] +long_df = pl.DataFrame({"numbers": np.random.randint(0, 100_000, 100_000)}) + +result = long_df.select( + pl.col("numbers").n_unique().alias("n_unique"), + pl.col("numbers").approx_n_unique().alias("approx_n_unique"), +) + +print(result) +# --8<-- [end:count] + +# --8<-- [start:value_counts] +result = df.select( + pl.col("names").value_counts().alias("value_counts"), +) + +print(result) +# --8<-- [end:value_counts] + +# --8<-- [start:unique_counts] +result = df.select( + pl.col("names").unique(maintain_order=True).alias("unique"), + pl.col("names").unique_counts().alias("unique_counts"), +) + +print(result) +# --8<-- [end:unique_counts] + +# --8<-- [start:collatz] +result = df.select( + pl.col("nrs"), + pl.when(pl.col("nrs") % 2 == 1) # Is the number odd? 
+ .then(3 * pl.col("nrs") + 1) # If so, multiply by 3 and add 1. + .otherwise(pl.col("nrs") // 2) # If not, divide by 2. + .alias("Collatz"), +) + +print(result) +# --8<-- [end:collatz] diff --git a/docs/source/src/python/user-guide/expressions/operators.py b/docs/source/src/python/user-guide/expressions/operators.py deleted file mode 100644 index 92bf57952332..000000000000 --- a/docs/source/src/python/user-guide/expressions/operators.py +++ /dev/null @@ -1,44 +0,0 @@ -# --8<-- [start:setup] - -import polars as pl -import numpy as np - -np.random.seed(12) -# --8<-- [end:setup] - - -# --8<-- [start:dataframe] -df = pl.DataFrame( - { - "nrs": [1, 2, 3, None, 5], - "names": ["foo", "ham", "spam", "egg", None], - "random": np.random.rand(5), - "groups": ["A", "A", "B", "C", "B"], - } -) -print(df) -# --8<-- [end:dataframe] - -# --8<-- [start:numerical] - -df_numerical = df.select( - (pl.col("nrs") + 5).alias("nrs + 5"), - (pl.col("nrs") - 5).alias("nrs - 5"), - (pl.col("nrs") * pl.col("random")).alias("nrs * random"), - (pl.col("nrs") / pl.col("random")).alias("nrs / random"), -) -print(df_numerical) - -# --8<-- [end:numerical] - -# --8<-- [start:logical] -df_logical = df.select( - (pl.col("nrs") > 1).alias("nrs > 1"), - (pl.col("random") <= 0.5).alias("random <= .5"), - (pl.col("nrs") != 1).alias("nrs != 1"), - (pl.col("nrs") == 1).alias("nrs == 1"), - ((pl.col("random") <= 0.5) & (pl.col("nrs") > 1)).alias("and_expr"), # and - ((pl.col("random") <= 0.5) | (pl.col("nrs") > 1)).alias("or_expr"), # or -) -print(df_logical) -# --8<-- [end:logical] diff --git a/docs/source/src/python/user-guide/expressions/strings.py b/docs/source/src/python/user-guide/expressions/strings.py index 379c20358feb..a68d032d702f 100644 --- a/docs/source/src/python/user-guide/expressions/strings.py +++ b/docs/source/src/python/user-guide/expressions/strings.py @@ -1,61 +1,112 @@ -# --8<-- [start:setup] +# --8<-- [start:df] import polars as pl -# --8<-- [end:setup] - - -# --8<-- [start:df] -df = 
pl.DataFrame({"animal": ["Crab", "cat and dog", "rab$bit", None]}) +df = pl.DataFrame( + { + "language": ["English", "Dutch", "Portuguese", "Finish"], + "fruit": ["pear", "peer", "pêra", "päärynä"], + } +) -out = df.select( - pl.col("animal").str.len_bytes().alias("byte_count"), - pl.col("animal").str.len_chars().alias("letter_count"), +result = df.with_columns( + pl.col("fruit").str.len_bytes().alias("byte_count"), + pl.col("fruit").str.len_chars().alias("letter_count"), ) -print(out) +print(result) # --8<-- [end:df] # --8<-- [start:existence] -out = df.select( - pl.col("animal"), - pl.col("animal").str.contains("cat|bit").alias("regex"), - pl.col("animal").str.contains("rab$", literal=True).alias("literal"), - pl.col("animal").str.starts_with("rab").alias("starts_with"), - pl.col("animal").str.ends_with("dog").alias("ends_with"), -) -print(out) +result = df.select( + pl.col("fruit"), + pl.col("fruit").str.starts_with("p").alias("starts_with_p"), + pl.col("fruit").str.contains("p..r").alias("p..r"), + pl.col("fruit").str.contains("e+").alias("e+"), + pl.col("fruit").str.ends_with("r").alias("ends_with_r"), +) +print(result) # --8<-- [end:existence] # --8<-- [start:extract] df = pl.DataFrame( { - "a": [ + "urls": [ "http://vote.com/ballon_dor?candidate=messi&ref=polars", "http://vote.com/ballon_dor?candidat=jorginho&ref=polars", "http://vote.com/ballon_dor?candidate=ronaldo&ref=polars", ] } ) -out = df.select( - pl.col("a").str.extract(r"candidate=(\w+)", group_index=1), +result = df.select( + pl.col("urls").str.extract(r"candidate=(\w+)", group_index=1), ) -print(out) +print(result) # --8<-- [end:extract] # --8<-- [start:extract_all] -df = pl.DataFrame({"foo": ["123 bla 45 asd", "xyz 678 910t"]}) -out = df.select( - pl.col("foo").str.extract_all(r"(\d+)").alias("extracted_nrs"), +df = pl.DataFrame({"text": ["123 bla 45 asd", "xyz 678 910t"]}) +result = df.select( + pl.col("text").str.extract_all(r"(\d+)").alias("extracted_nrs"), ) -print(out) +print(result) # 
--8<-- [end:extract_all] # --8<-- [start:replace] -df = pl.DataFrame({"id": [1, 2], "text": ["123abc", "abc456"]}) -out = df.with_columns( - pl.col("text").str.replace(r"abc\b", "ABC"), - pl.col("text").str.replace_all("a", "-", literal=True).alias("text_replace_all"), +df = pl.DataFrame({"text": ["123abc", "abc456"]}) +result = df.with_columns( + pl.col("text").str.replace(r"\d", "-"), + pl.col("text").str.replace_all(r"\d", "-").alias("text_replace_all"), ) -print(out) +print(result) # --8<-- [end:replace] + +# --8<-- [start:casing] +addresses = pl.DataFrame( + { + "addresses": [ + "128 PERF st", + "Rust blVD, 158", + "PoLaRs Av, 12", + "1042 Query sq", + ] + } +) + +addresses = addresses.select( + pl.col("addresses").alias("originals"), + pl.col("addresses").str.to_titlecase(), + pl.col("addresses").str.to_lowercase().alias("lower"), + pl.col("addresses").str.to_uppercase().alias("upper"), +) +print(addresses) +# --8<-- [end:casing] + +# --8<-- [start:strip] +addr = pl.col("addresses") +chars = ", 0123456789" +result = addresses.select( + addr.str.strip_chars(chars).alias("strip"), + addr.str.strip_chars_end(chars).alias("end"), + addr.str.strip_chars_start(chars).alias("start"), + addr.str.strip_prefix("128 ").alias("prefix"), + addr.str.strip_suffix(", 158").alias("suffix"), +) +print(result) +# --8<-- [end:strip] + +# --8<-- [start:slice] +df = pl.DataFrame( + { + "fruits": ["pear", "mango", "dragonfruit", "passionfruit"], + "n": [1, -1, 4, -4], + } +) + +result = df.with_columns( + pl.col("fruits").str.slice(pl.col("n")).alias("slice"), + pl.col("fruits").str.head(pl.col("n")).alias("head"), + pl.col("fruits").str.tail(pl.col("n")).alias("tail"), +) +print(result) +# --8<-- [end:slice] diff --git a/docs/source/src/python/user-guide/expressions/structs.py b/docs/source/src/python/user-guide/expressions/structs.py index 232ccea9b8c4..f500343b428d 100644 --- a/docs/source/src/python/user-guide/expressions/structs.py +++ 
b/docs/source/src/python/user-guide/expressions/structs.py @@ -1,28 +1,25 @@ -# --8<-- [start:setup] +# --8<-- [start:ratings_df] import polars as pl -# --8<-- [end:setup] - -# --8<-- [start:ratings_df] ratings = pl.DataFrame( { - "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"], - "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"], - "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6], - "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 26], + "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "Cars"], + "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "NE"], + "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.5, 4.9, 4.7, 4.6], + "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 28], } ) print(ratings) # --8<-- [end:ratings_df] # --8<-- [start:state_value_counts] -out = ratings.select(pl.col("Theatre").value_counts(sort=True)) -print(out) +result = ratings.select(pl.col("Theatre").value_counts(sort=True)) +print(result) # --8<-- [end:state_value_counts] # --8<-- [start:struct_unnest] -out = ratings.select(pl.col("Theatre").value_counts(sort=True)).unnest("Theatre") -print(out) +result = ratings.select(pl.col("Theatre").value_counts(sort=True)).unnest("Theatre") +print(result) # --8<-- [end:struct_unnest] # --8<-- [start:series_struct] @@ -36,43 +33,87 @@ print(rating_series) # --8<-- [end:series_struct] +# --8<-- [start:series_struct_error] +null_rating_series = pl.Series( + "ratings", + [ + {"Movie": "Cars", "Theatre": "NE", "Avg_Rating": 4.5}, + {"Mov": "Toy Story", "Theatre": "ME", "Avg_Rating": 4.9}, + {"Movie": "Snow White", "Theatre": "IL", "Avg_Rating": "4.7"}, + ], + strict=False, # To show the final structs with `null` values. 
+) +print(null_rating_series) +# --8<-- [end:series_struct_error] + # --8<-- [start:series_struct_extract] -out = rating_series.struct.field("Movie") -print(out) +result = rating_series.struct.field("Movie") +print(result) # --8<-- [end:series_struct_extract] # --8<-- [start:series_struct_rename] -out = ( - rating_series.to_frame() - .select(pl.col("ratings").struct.rename_fields(["Film", "State", "Value"])) - .unnest("ratings") -) -print(out) +result = rating_series.struct.rename_fields(["Film", "State", "Value"]) +print(result) # --8<-- [end:series_struct_rename] +# --8<-- [start:struct-rename-check] +print( + result.to_frame().unnest("ratings"), +) +# --8<-- [end:struct-rename-check] + # --8<-- [start:struct_duplicates] -out = ratings.filter(pl.struct("Movie", "Theatre").is_duplicated()) -print(out) +result = ratings.filter(pl.struct("Movie", "Theatre").is_duplicated()) +print(result) # --8<-- [end:struct_duplicates] # --8<-- [start:struct_ranking] -out = ratings.with_columns( +result = ratings.with_columns( pl.struct("Count", "Avg_Rating") .rank("dense", descending=True) .over("Movie", "Theatre") .alias("Rank") ).filter(pl.struct("Movie", "Theatre").is_duplicated()) -print(out) + +print(result) # --8<-- [end:struct_ranking] # --8<-- [start:multi_column_apply] df = pl.DataFrame({"keys": ["a", "a", "b"], "values": [10, 7, 1]}) -out = df.select( +result = df.select( pl.struct(["keys", "values"]) .map_elements(lambda x: len(x["keys"]) + x["values"], return_dtype=pl.Int64) .alias("solution_map_elements"), (pl.col("keys").str.len_bytes() + pl.col("values")).alias("solution_expr"), ) -print(out) +print(result) # --8<-- [end:multi_column_apply] + + +# --8<-- [start:ack] +def ack(m, n): + if not m: + return n + 1 + if not n: + return ack(m - 1, 1) + return ack(m - 1, ack(m, n - 1)) + + +# --8<-- [end:ack] + +# --8<-- [start:struct-ack] +values = pl.DataFrame( + { + "m": [0, 0, 0, 1, 1, 1, 2], + "n": [2, 3, 4, 1, 2, 3, 1], + } +) +result = values.with_columns( + 
pl.struct(["m", "n"]) + .map_elements(lambda s: ack(s["m"], s["n"]), return_dtype=pl.Int64) + .alias("ack") +) + +print(result) +# --8<-- [end:struct-ack] diff --git a/docs/source/src/python/user-guide/expressions/window.py b/docs/source/src/python/user-guide/expressions/window.py index 9ed9ce5d4f88..f82da48d75f1 100644 --- a/docs/source/src/python/user-guide/expressions/window.py +++ b/docs/source/src/python/user-guide/expressions/window.py @@ -1,16 +1,103 @@ # --8<-- [start:pokemon] import polars as pl -# then let's load some csv data with information about pokemon -df = pl.read_csv( - "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv" +types = ( + "Grass Water Fire Normal Ground Electric Psychic Fighting Bug Steel " + "Flying Dragon Dark Ghost Poison Rock Ice Fairy".split() ) -print(df.head()) +type_enum = pl.Enum(types) +# then let's load some csv data with information about pokemon +pokemon = pl.read_csv( + "https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv", +).cast({"Type 1": type_enum, "Type 2": type_enum}) +print(pokemon.head()) # --8<-- [end:pokemon] +# --8<-- [start:rank] +result = pokemon.select( + pl.col("Name", "Type 1"), + pl.col("Speed").rank("dense", descending=True).over("Type 1").alias("Speed rank"), +) + +print(result) +# --8<-- [end:rank] + +# --8<-- [start:rank-multiple] +result = pokemon.select( + pl.col("Name", "Type 1", "Type 2"), + pl.col("Speed") + .rank("dense", descending=True) + .over("Type 1", "Type 2") + .alias("Speed rank"), +) + +print(result) +# --8<-- [end:rank-multiple] + +# --8<-- [start:rank-explode] +result = ( + pokemon.group_by("Type 1") + .agg( + pl.col("Name"), + pl.col("Speed").rank("dense", descending=True).alias("Speed rank"), + ) + .select(pl.col("Name"), pl.col("Type 1"), pl.col("Speed rank")) + .explode("Name", "Speed rank") +) + +print(result) +# 
--8<-- [end:rank-explode] + +# --8<-- [start:athletes] +athletes = pl.DataFrame( + { + "athlete": list("ABCDEF"), + "country": ["PT", "NL", "NL", "PT", "PT", "NL"], + "rank": [6, 1, 5, 4, 2, 3], + } +) +print(athletes) +# --8<-- [end:athletes] + +# --8<-- [start:athletes-sort-over-country] +result = athletes.select( + pl.col("athlete", "rank").sort_by(pl.col("rank")).over(pl.col("country")), + pl.col("country"), +) + +print(result) +# --8<-- [end:athletes-sort-over-country] + +# --8<-- [start:athletes-explode] +result = athletes.select( + pl.all() + .sort_by(pl.col("rank")) + .over(pl.col("country"), mapping_strategy="explode"), +) + +print(result) +# --8<-- [end:athletes-explode] + +# --8<-- [start:athletes-join] +result = athletes.with_columns( + pl.col("rank").sort().over(pl.col("country"), mapping_strategy="join"), +) + +print(result) +# --8<-- [end:athletes-join] + +# --8<-- [start:pokemon-mean] +result = pokemon.select( + pl.col("Name", "Type 1", "Speed"), + pl.col("Speed").mean().over(pl.col("Type 1")).alias("Mean speed in group"), +) + +print(result) +# --8<-- [end:pokemon-mean] + # --8<-- [start:group_by] -out = df.select( +result = pokemon.select( "Type 1", "Type 2", pl.col("Attack").mean().over("Type 1").alias("avg_attack_by_type"), @@ -20,11 +107,11 @@ .alias("avg_defense_by_type_combination"), pl.col("Attack").mean().alias("avg_attack"), ) -print(out) +print(result) # --8<-- [end:group_by] # --8<-- [start:operations] -filtered = df.filter(pl.col("Type 2") == "Psychic").select( +filtered = pokemon.filter(pl.col("Type 2") == "Psychic").select( "Name", "Type 1", "Speed", @@ -33,36 +120,14 @@ # --8<-- [end:operations] # --8<-- [start:sort] -out = filtered.with_columns( +result = filtered.with_columns( pl.col("Name", "Speed").sort_by("Speed", descending=True).over("Type 1"), ) -print(out) +print(result) # --8<-- [end:sort] -# --8<-- [start:rules] -# aggregate and broadcast within a group -# output type: -> Int32 -pl.sum("foo").over("groups") - -# sum within 
a group and multiply with group elements -# output type: -> Int32 -(pl.col("x").sum() * pl.col("y")).over("groups") - -# sum within a group and multiply with group elements -# and aggregate the group to a list -# output type: -> List(Int32) -(pl.col("x").sum() * pl.col("y")).over("groups", mapping_strategy="join") - -# sum within a group and multiply with group elements -# and aggregate the group to a list -# then explode the list to multiple rows - -# This is the fastest method to do things over groups when the groups are sorted -(pl.col("x").sum() * pl.col("y")).over("groups", mapping_strategy="explode") -# --8<-- [end:rules] - # --8<-- [start:examples] -out = df.sort("Type 1").select( +result = pokemon.sort("Type 1").select( pl.col("Type 1").head(3).over("Type 1", mapping_strategy="explode"), pl.col("Name") .sort_by(pl.col("Speed"), descending=True) @@ -80,5 +145,5 @@ .over("Type 1", mapping_strategy="explode") .alias("sorted_by_alphabet"), ) -print(out) +print(result) # --8<-- [end:examples] diff --git a/docs/source/src/rust/Cargo.toml b/docs/source/src/rust/Cargo.toml index 8a6607d4aa84..28afe176086e 100644 --- a/docs/source/src/rust/Cargo.toml +++ b/docs/source/src/rust/Cargo.toml @@ -34,10 +34,6 @@ required-features = ["polars/lazy", "polars/temporal", "polars/round_series", "p name = "user-guide-concepts-data-types-and-structures" path = "user-guide/concepts/data-types-and-structures.rs" -[[bin]] -name = "user-guide-concepts-contexts" -path = "user-guide/concepts/contexts.rs" -required-features = ["polars/lazy"] [[bin]] name = "user-guide-concepts-expressions" path = "user-guide/concepts/expressions.rs" @@ -68,9 +64,9 @@ name = "user-guide-expressions-folds" path = "user-guide/expressions/folds.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-functions" -path = "user-guide/expressions/functions.rs" -required-features = ["polars/lazy"] +name = "user-guide-expressions-expression-expansion" +path = 
"user-guide/expressions/expression-expansion.rs" +required-features = [] [[bin]] name = "user-guide-expressions-lists" path = "user-guide/expressions/lists.rs" @@ -80,8 +76,8 @@ name = "user-guide-expressions-missing-data" path = "user-guide/expressions/missing-data.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-operators" -path = "user-guide/expressions/operators.rs" +name = "user-guide-expressions-operations" +path = "user-guide/expressions/operations.rs" required-features = ["polars/lazy"] [[bin]] name = "user-guide-expressions-strings" @@ -92,10 +88,6 @@ name = "user-guide-expressions-structs" path = "user-guide/expressions/structs.rs" required-features = ["polars/lazy"] [[bin]] -name = "user-guide-expressions-user-defined-functions" -path = "user-guide/expressions/user-defined-functions.rs" -required-features = ["polars/lazy"] -[[bin]] name = "user-guide-expressions-window" path = "user-guide/expressions/window.rs" required-features = ["polars/lazy"] diff --git a/docs/source/src/rust/user-guide/concepts/contexts.rs b/docs/source/src/rust/user-guide/concepts/contexts.rs deleted file mode 100644 index 1ff1114d4bf0..000000000000 --- a/docs/source/src/rust/user-guide/concepts/contexts.rs +++ /dev/null @@ -1,70 +0,0 @@ -use polars::prelude::*; - -fn main() -> Result<(), Box> { - // --8<-- [start:dataframe] - use rand::{thread_rng, Rng}; - - let mut arr = [0f64; 5]; - thread_rng().fill(&mut arr); - - let df = df! 
( - "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)], - "names" => &[Some("foo"), Some("ham"), Some("spam"), Some("eggs"), None], - "random" => &arr, - "groups" => &["A", "A", "B", "C", "B"], - )?; - - println!("{}", &df); - // --8<-- [end:dataframe] - - // --8<-- [start:select] - let out = df - .clone() - .lazy() - .select([ - sum("nrs"), - col("names").sort(Default::default()), - col("names").first().alias("first name"), - (mean("nrs") * lit(10)).alias("10xnrs"), - ]) - .collect()?; - println!("{}", out); - // --8<-- [end:select] - - // --8<-- [start:filter] - let out = df.clone().lazy().filter(col("nrs").gt(lit(2))).collect()?; - println!("{}", out); - // --8<-- [end:filter] - - // --8<-- [start:with_columns] - let out = df - .clone() - .lazy() - .with_columns([ - sum("nrs").alias("nrs_sum"), - col("random").count().alias("count"), - ]) - .collect()?; - println!("{}", out); - // --8<-- [end:with_columns] - - // --8<-- [start:group_by] - let out = df - .lazy() - .group_by([col("groups")]) - .agg([ - sum("nrs"), // sum nrs by groups - col("random").count().alias("count"), // count group members - // sum random where name != null - col("random") - .filter(col("names").is_not_null()) - .sum() - .name() - .suffix("_sum"), - col("names").reverse().alias("reversed names"), - ]) - .collect()?; - println!("{}", out); - // --8<-- [end:group_by] - Ok(()) -} diff --git a/docs/source/src/rust/user-guide/expressions/aggregation.rs b/docs/source/src/rust/user-guide/expressions/aggregation.rs index 9436565330bf..43f60c0c6402 100644 --- a/docs/source/src/rust/user-guide/expressions/aggregation.rs +++ b/docs/source/src/rust/user-guide/expressions/aggregation.rs @@ -1,9 +1,8 @@ -use polars::prelude::*; - fn main() -> Result<(), Box> { // --8<-- [start:dataframe] use std::io::Cursor; + use polars::prelude::*; use reqwest::blocking::Client; let url = "https://theunitedstates.io/congress-legislators/legislators-historical.csv"; @@ -89,7 +88,7 @@ fn main() -> Result<(), Box> { 
.clone() .lazy() .group_by(["state", "party"]) - .agg([col("party").count().alias("count")]) + .agg([len().count().alias("count")]) .filter( col("party") .eq(lit("Anti-Administration")) @@ -135,6 +134,9 @@ fn main() -> Result<(), Box> { println!("{}", df); // --8<-- [end:filter] + // --8<-- [start:filter-nested] + // --8<-- [end:filter-nested] + // --8<-- [start:sort] fn get_person() -> Expr { col("first_name") + lit(" ") + col("last_name") diff --git a/docs/source/src/rust/user-guide/expressions/column-selections.rs b/docs/source/src/rust/user-guide/expressions/column-selections.rs index c0f3f35ac3b0..0dff5ab38c62 100644 --- a/docs/source/src/rust/user-guide/expressions/column-selections.rs +++ b/docs/source/src/rust/user-guide/expressions/column-selections.rs @@ -79,7 +79,7 @@ fn main() -> Result<(), Box> { // --8<-- [start:selectors_by_name] // Not available in Rust, refer the following link - // https://github.com/pola-rs/polars/issues/1059 + // https://github.com/pola-rs/polars/issues/10594 // --8<-- [end:selectors_by_name] // --8<-- [start:selectors_to_expr] diff --git a/docs/source/src/rust/user-guide/expressions/expression-expansion.rs b/docs/source/src/rust/user-guide/expressions/expression-expansion.rs new file mode 100644 index 000000000000..7f3f84bf7b59 --- /dev/null +++ b/docs/source/src/rust/user-guide/expressions/expression-expansion.rs @@ -0,0 +1,87 @@ +fn main() -> Result<(), Box> { + // --8<-- [start:df] + // use polars::prelude::*; + // --8<-- [end:df] + + // --8<-- [start:col-with-names] + // --8<-- [end:col-with-names] + + // --8<-- [start:expression-list] + // --8<-- [end:expression-list] + + // --8<-- [start:col-with-dtype] + // --8<-- [end:col-with-dtype] + + // --8<-- [start:col-with-dtypes] + // --8<-- [end:col-with-dtypes] + + // --8<-- [start:col-with-regex] + // --8<-- [end:col-with-regex] + + // --8<-- [start:col-error] + // --8<-- [end:col-error] + + // --8<-- [start:all] + // --8<-- [end:all] + + // --8<-- [start:all-exclude] + // 
--8<-- [end:all-exclude] + + // --8<-- [start:col-exclude] + // --8<-- [end:col-exclude] + + // --8<-- [start:duplicate-error] + // --8<-- [end:duplicate-error] + + // --8<-- [start:alias] + // --8<-- [end:alias] + + // --8<-- [start:prefix-suffix] + // --8<-- [end:prefix-suffix] + + // --8<-- [start:name-map] + // --8<-- [end:name-map] + + // --8<-- [start:for-with_columns] + // --8<-- [end:for-with_columns] + + // --8<-- [start:yield-expressions] + // --8<-- [end:yield-expressions] + + // --8<-- [start:selectors] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors] + + // --8<-- [start:selectors-set-operations] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors-set-operations] + + // --8<-- [start:selectors-expressions] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selectors-expressions] + + // --8<-- [start:selector-ambiguity] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:selector-ambiguity] + + // --8<-- [start:as_expr] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:as_expr] + + // --8<-- [start:is_selector] + // Selectors are not available in Rust yet. + // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:is_selector] + + // --8<-- [start:expand_selector] + // Selectors are not available in Rust yet. 
+ // Refer to https://github.com/pola-rs/polars/issues/10594 + // --8<-- [end:expand_selector] + + Ok(()) +} diff --git a/docs/source/src/rust/user-guide/expressions/folds.rs b/docs/source/src/rust/user-guide/expressions/folds.rs index e7df220e0644..f8e45f1ad3a1 100644 --- a/docs/source/src/rust/user-guide/expressions/folds.rs +++ b/docs/source/src/rust/user-guide/expressions/folds.rs @@ -1,7 +1,6 @@ -use polars::prelude::*; - fn main() -> Result<(), Box> { // --8<-- [start:mansum] + use polars::prelude::*; let df = df!( "a" => &[1, 2, 3], "b" => &[10, 20, 30], @@ -13,6 +12,13 @@ fn main() -> Result<(), Box> { .collect()?; println!("{}", out); // --8<-- [end:mansum] + // --8<-- [start:mansum-explicit] + // --8<-- [end:mansum-explicit] + + // --8<-- [start:manprod] + // --8<-- [end:manprod] + // --8<-- [start:manprod-fixed] + // --8<-- [end:manprod-fixed] // --8<-- [start:conditional] let df = df!( diff --git a/docs/source/src/rust/user-guide/expressions/functions.rs b/docs/source/src/rust/user-guide/expressions/functions.rs deleted file mode 100644 index 490809b75557..000000000000 --- a/docs/source/src/rust/user-guide/expressions/functions.rs +++ /dev/null @@ -1,79 +0,0 @@ -use polars::prelude::*; - -fn main() -> Result<(), Box> { - // --8<-- [start:dataframe] - use rand::{thread_rng, Rng}; - - let mut arr = [0f64; 5]; - thread_rng().fill(&mut arr); - - let df = df! 
( - "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)], - "names" => &["foo", "ham", "spam", "egg", "spam"], - "random" => &arr, - "groups" => &["A", "A", "B", "C", "B"], - )?; - - println!("{}", &df); - // --8<-- [end:dataframe] - - // --8<-- [start:samename] - let df_samename = df.clone().lazy().select([col("nrs") + lit(5)]).collect()?; - println!("{}", &df_samename); - // --8<-- [end:samename] - - // --8<-- [start:samenametwice] - let df_samename2 = df - .clone() - .lazy() - .select([col("nrs") + lit(5), col("nrs") - lit(5)]) - .collect(); - match df_samename2 { - Ok(df) => println!("{}", &df), - Err(e) => println!("{:?}", &e), - }; - // --8<-- [end:samenametwice] - - // --8<-- [start:samenamealias] - let df_alias = df - .clone() - .lazy() - .select([ - (col("nrs") + lit(5)).alias("nrs + 5"), - (col("nrs") - lit(5)).alias("nrs - 5"), - ]) - .collect()?; - println!("{}", &df_alias); - // --8<-- [end:samenamealias] - - // --8<-- [start:countunique] - let df_alias = df - .clone() - .lazy() - .select([ - col("names").n_unique().alias("unique"), - // Following query shows there isn't anything in Rust API - // https://docs.rs/polars/latest/polars/?search=approx_n_unique - // col("names").approx_n_unique().alias("unique_approx"), - ]) - .collect()?; - println!("{}", &df_alias); - // --8<-- [end:countunique] - - // --8<-- [start:conditional] - let df_conditional = df - .clone() - .lazy() - .select([ - col("nrs"), - when(col("nrs").gt(2)) - .then(lit(true)) - .otherwise(lit(false)) - .alias("conditional"), - ]) - .collect()?; - println!("{}", &df_conditional); - // --8<-- [end:conditional] - - Ok(()) -} diff --git a/docs/source/src/rust/user-guide/expressions/lists.rs b/docs/source/src/rust/user-guide/expressions/lists.rs index fd097d98df7e..5ac4b32462e7 100644 --- a/docs/source/src/rust/user-guide/expressions/lists.rs +++ b/docs/source/src/rust/user-guide/expressions/lists.rs @@ -1,165 +1,40 @@ -// --8<-- [start:setup] -use polars::prelude::*; -// --8<-- [end:setup] 
fn main() -> Result<(), Box> { - // --8<-- [start:weather_df] - let stns: Vec = (1..6).map(|i| format!("Station {i}")).collect(); - let weather = df!( - "station"=> &stns, - "temperatures"=> &[ - "20 5 5 E1 7 13 19 9 6 20", - "18 8 16 11 23 E2 8 E2 E2 E2 90 70 40", - "19 24 E9 16 6 12 10 22", - "E2 E0 15 7 8 10 E1 24 17 13 6", - "14 8 E0 16 22 24 E1", - ], - )?; - println!("{}", &weather); - // --8<-- [end:weather_df] + // --8<-- [start:list-example] + // use polars::prelude::*; + // --8<-- [end:list-example] - // --8<-- [start:string_to_list] - let out = weather - .clone() - .lazy() - .with_columns([col("temperatures").str().split(lit(" "))]) - .collect()?; - println!("{}", &out); - // --8<-- [end:string_to_list] + // --8<-- [start:array-example] + // --8<-- [end:array-example] - // --8<-- [start:explode_to_atomic] - let out = weather - .clone() - .lazy() - .with_columns([col("temperatures").str().split(lit(" "))]) - .explode(["temperatures"]) - .collect()?; - println!("{}", &out); - // --8<-- [end:explode_to_atomic] + // --8<-- [start:numpy-array-inference] + // --8<-- [end:numpy-array-inference] - // --8<-- [start:list_ops] - let out = weather - .clone() - .lazy() - .with_columns([col("temperatures").str().split(lit(" "))]) - .with_columns([ - col("temperatures").list().head(lit(3)).alias("top3"), - col("temperatures") - .list() - .slice(lit(-3), lit(3)) - .alias("bottom_3"), - col("temperatures").list().len().alias("obs"), - ]) - .collect()?; - println!("{}", &out); - // --8<-- [end:list_ops] + // --8<-- [start:weather] + // --8<-- [end:weather] - // --8<-- [start:count_errors] - let out = weather - .clone() - .lazy() - .with_columns([col("temperatures") - .str() - .split(lit(" ")) - .list() - .eval(col("").cast(DataType::Int64).is_null(), false) - .list() - .sum() - .alias("errors")]) - .collect()?; - println!("{}", &out); - // --8<-- [end:count_errors] + // --8<-- [start:split] + // --8<-- [end:split] - // --8<-- [start:count_errors_regex] - let out = weather 
- .clone() - .lazy() - .with_columns([col("temperatures") - .str() - .split(lit(" ")) - .list() - .eval(col("").str().contains(lit("(?i)[a-z]"), false), false) - .list() - .sum() - .alias("errors")]) - .collect()?; - println!("{}", &out); - // --8<-- [end:count_errors_regex] + // --8<-- [start:explode] + // --8<-- [end:explode] - // --8<-- [start:weather_by_day] - let stns: Vec = (1..11).map(|i| format!("Station {i}")).collect(); - let weather_by_day = df!( - "station" => &stns, - "day_1" => &[17, 11, 8, 22, 9, 21, 20, 8, 8, 17], - "day_2" => &[15, 11, 10, 8, 7, 14, 18, 21, 15, 13], - "day_3" => &[16, 15, 24, 24, 8, 23, 19, 23, 16, 10], - )?; - println!("{}", &weather_by_day); - // --8<-- [end:weather_by_day] - - // --8<-- [start:weather_by_day_rank] - let rank_pct = (col("") - .rank( - RankOptions { - method: RankMethod::Average, - descending: true, - }, - None, - ) - .cast(DataType::Float32) - / col("*").count().cast(DataType::Float32)) - .round(2); + // --8<-- [start:list-slicing] + // --8<-- [end:list-slicing] - let out = weather_by_day - .clone() - .lazy() - .with_columns( - // create the list of homogeneous data - [concat_list([all().exclude(["station"])])?.alias("all_temps")], - ) - .select( - // select all columns except the intermediate list - [ - all().exclude(["all_temps"]), - // compute the rank by calling `list.eval` - col("all_temps") - .list() - .eval(rank_pct, true) - .alias("temps_rank"), - ], - ) - .collect()?; + // --8<-- [start:element-wise-casting] + // --8<-- [end:element-wise-casting] - println!("{}", &out); - // --8<-- [end:weather_by_day_rank] + // --8<-- [start:element-wise-regex] + // --8<-- [end:element-wise-regex] - // --8<-- [start:array_df] - let mut col1: ListPrimitiveChunkedBuilder = - ListPrimitiveChunkedBuilder::new("Array_1".into(), 8, 8, DataType::Int32); - col1.append_slice(&[1, 3]); - col1.append_slice(&[2, 5]); - let mut col2: ListPrimitiveChunkedBuilder = - ListPrimitiveChunkedBuilder::new("Array_2".into(), 8, 8, 
DataType::Int32); - col2.append_slice(&[1, 7, 3]); - col2.append_slice(&[8, 1, 0]); - let array_df = DataFrame::new(vec![ - col1.finish().into_column(), - col2.finish().into_column(), - ])?; + // --8<-- [start:weather_by_day] + // --8<-- [end:weather_by_day] - println!("{}", &array_df); - // --8<-- [end:array_df] + // --8<-- [start:rank_pct] + // --8<-- [end:rank_pct] - // --8<-- [start:array_ops] - let out = array_df - .clone() - .lazy() - .select([ - col("Array_1").list().min().name().suffix("_min"), - col("Array_2").list().sum().name().suffix("_sum"), - ]) - .collect()?; - println!("{}", &out); - // --8<-- [end:array_ops] + // --8<-- [start:array-overview] + // --8<-- [end:array-overview] Ok(()) } diff --git a/docs/source/src/rust/user-guide/expressions/missing-data.rs b/docs/source/src/rust/user-guide/expressions/missing-data.rs index 8d78310cb0a9..c73b011ad0a5 100644 --- a/docs/source/src/rust/user-guide/expressions/missing-data.rs +++ b/docs/source/src/rust/user-guide/expressions/missing-data.rs @@ -1,8 +1,6 @@ -use polars::prelude::*; - fn main() -> Result<(), Box> { // --8<-- [start:dataframe] - + use polars::prelude::*; let df = df! 
( "value" => &[Some(1), None], )?; @@ -76,6 +74,9 @@ fn main() -> Result<(), Box> { println!("{}", &nan_df); // --8<-- [end:nan] + // --8<-- [start:nan-computed] + // --8<-- [end:nan-computed] + // --8<-- [start:nanfill] let mean_nan_df = nan_df .clone() diff --git a/docs/source/src/rust/user-guide/expressions/operators.rs b/docs/source/src/rust/user-guide/expressions/operations.rs similarity index 58% rename from docs/source/src/rust/user-guide/expressions/operators.rs rename to docs/source/src/rust/user-guide/expressions/operations.rs index 868d301c2182..c92527709bc7 100644 --- a/docs/source/src/rust/user-guide/expressions/operators.rs +++ b/docs/source/src/rust/user-guide/expressions/operations.rs @@ -1,7 +1,6 @@ -use polars::prelude::*; - fn main() -> Result<(), Box> { // --8<-- [start:dataframe] + use polars::prelude::*; use rand::{thread_rng, Rng}; let mut arr = [0f64; 5]; @@ -9,15 +8,15 @@ fn main() -> Result<(), Box> { let df = df! ( "nrs" => &[Some(1), Some(2), Some(3), None, Some(5)], - "names" => &[Some("foo"), Some("ham"), Some("spam"), Some("eggs"), None], + "names" => &["foo", "ham", "spam", "egg", "spam"], "random" => &arr, - "groups" => &["A", "A", "B", "C", "B"], + "groups" => &["A", "A", "B", "A", "B"], )?; println!("{}", &df); // --8<-- [end:dataframe] - // --8<-- [start:numerical] + // --8<-- [start:arithmetic] let df_numerical = df .clone() .lazy() @@ -26,13 +25,18 @@ fn main() -> Result<(), Box> { (col("nrs") - lit(5)).alias("nrs - 5"), (col("nrs") * col("random")).alias("nrs * random"), (col("nrs") / col("random")).alias("nrs / random"), + (col("nrs").pow(lit(2))).alias("nrs ** 2"), + (col("nrs") % lit(3)).alias("nrs % 3"), ]) .collect()?; println!("{}", &df_numerical); - // --8<-- [end:numerical] + // --8<-- [end:arithmetic] + + // --8<-- [start:operator-overloading] + // --8<-- [end:operator-overloading] - // --8<-- [start:logical] - let df_logical = df + // --8<-- [start:comparison] + let df_comparison = df .clone() .lazy() .select([ @@ 
-48,7 +52,26 @@ fn main() -> Result<(), Box> { .alias("or_expr"), // or ]) .collect()?; - println!("{}", &df_logical); - // --8<-- [end:logical] + println!("{}", &df_comparison); + // --8<-- [end:comparison] + + // --8<-- [start:boolean] + // --8<-- [end:boolean] + + // --8<-- [start:bitwise] + // --8<-- [end:bitwise] + + // --8<-- [start:count] + // --8<-- [end:count] + + // --8<-- [start:value_counts] + // --8<-- [end:value_counts] + + // --8<-- [start:unique_counts] + // --8<-- [end:unique_counts] + + // --8<-- [start:collatz] + // --8<-- [end:collatz] + Ok(()) } diff --git a/docs/source/src/rust/user-guide/expressions/strings.rs b/docs/source/src/rust/user-guide/expressions/strings.rs index 8ebcfa5d6f22..3243f45b05fe 100644 --- a/docs/source/src/rust/user-guide/expressions/strings.rs +++ b/docs/source/src/rust/user-guide/expressions/strings.rs @@ -89,5 +89,14 @@ fn main() -> Result<(), Box> { println!("{}", &out); // --8<-- [end:replace] + // --8<-- [start:casing] + // --8<-- [end:casing] + + // --8<-- [start:strip] + // --8<-- [end:strip] + + // --8<-- [start:slice] + // --8<-- [end:slice] + Ok(()) } diff --git a/docs/source/src/rust/user-guide/expressions/structs.rs b/docs/source/src/rust/user-guide/expressions/structs.rs index cc6fff831d06..5063e6cf7b41 100644 --- a/docs/source/src/rust/user-guide/expressions/structs.rs +++ b/docs/source/src/rust/user-guide/expressions/structs.rs @@ -1,8 +1,6 @@ -// --8<-- [start:setup] -use polars::prelude::*; -// --8<-- [end:setup] fn main() -> Result<(), Box> { // --8<-- [start:ratings_df] + use polars::prelude::*; let ratings = df!( "Movie"=> &["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"], "Theatre"=> &["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"], @@ -44,6 +42,9 @@ fn main() -> Result<(), Box> { println!("{}", &rating_series); // // --8<-- [end:series_struct] + // --8<-- [start:series_struct_error] + // --8<-- [end:series_struct_error] + // --8<-- [start:series_struct_extract] let out 
= rating_series.struct_()?.field_by_name("Movie")?; println!("{}", &out); @@ -61,6 +62,9 @@ fn main() -> Result<(), Box> { println!("{}", &out); // --8<-- [end:series_struct_rename] + // --8<-- [start:struct-rename-check] + // --8<-- [end:struct-rename-check] + // --8<-- [start:struct_duplicates] let out = ratings .clone() @@ -142,7 +146,13 @@ fn main() -> Result<(), Box> { ]) .collect()?; println!("{}", out); - // --8<-- [end:multi_column_apply] + + // --8<-- [start:ack] + // --8<-- [end:ack] + + // --8<-- [start:struct-ack] + // --8<-- [end:struct-ack] + Ok(()) } diff --git a/docs/source/src/rust/user-guide/expressions/window.rs b/docs/source/src/rust/user-guide/expressions/window.rs index 6414bc984c09..33ea73eac67a 100644 --- a/docs/source/src/rust/user-guide/expressions/window.rs +++ b/docs/source/src/rust/user-guide/expressions/window.rs @@ -19,6 +19,25 @@ fn main() -> Result<(), Box> { println!("{}", df); // --8<-- [end:pokemon] + // --8<-- [start:rank] + // --8<-- [end:rank] + // --8<-- [start:rank-multiple] + // --8<-- [end:rank-multiple] + // --8<-- [start:rank-explode] + // --8<-- [end:rank-explode] + + // --8<-- [start:athletes] + // --8<-- [end:athletes] + // --8<-- [start:athletes-sort-over-country] + // --8<-- [end:athletes-sort-over-country] + // --8<-- [start:athletes-explode] + // --8<-- [end:athletes-explode] + // --8<-- [start:athletes-join] + // --8<-- [end:athletes-join] + + // --8<-- [start:pokemon-mean] + // --8<-- [end:pokemon-mean] + // --8<-- [start:group_by] let out = df .clone() diff --git a/docs/source/user-guide/concepts/data-types-and-structures.md b/docs/source/user-guide/concepts/data-types-and-structures.md index 2de8120f05a3..9d184790ea4f 100644 --- a/docs/source/user-guide/concepts/data-types-and-structures.md +++ b/docs/source/user-guide/concepts/data-types-and-structures.md @@ -167,8 +167,8 @@ much larger internal representations than 64-bit floats), and thus some error is | `Time` | Represents a time of day. 
| | `Datetime` | Represents a calendar date and time of day. | | `Duration` | Represents a time duration. | -| `Array` | Arrays with a known, fixed shape per series; akin to numpy arrays. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists.md). | -| `List` | Homogeneous 1D container with variable length. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists.md). | +| `Array` | Arrays with a known, fixed shape per series; akin to numpy arrays. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists-and-arrays.md). | +| `List` | Homogeneous 1D container with variable length. [Learn more about how arrays and lists differ and how to work with both](../expressions/lists-and-arrays.md). | | `Object` | Wraps arbitrary Python objects. | | `Categorical` | Efficient encoding of string data where the categories are inferred at runtime. [Learn more about how categoricals and enums differ and how to work with both](../expressions/categorical-data-and-enums.md). | | `Enum` | Efficient ordered encoding of a set of predetermined string categories. [Learn more about how categoricals and enums differ and how to work with both](../expressions/categorical-data-and-enums.md). | diff --git a/docs/source/user-guide/concepts/expressions-and-contexts.md b/docs/source/user-guide/concepts/expressions-and-contexts.md index 4ec537b71fb9..bee5cd130b45 100644 --- a/docs/source/user-guide/concepts/expressions-and-contexts.md +++ b/docs/source/user-guide/concepts/expressions-and-contexts.md @@ -143,7 +143,7 @@ The last example contained two grouping expressions and three aggregating expres If we look closely, the last aggregating expression mentioned two different columns: “weight” and “height”. Polars expressions support a feature called _expression expansion_. -Expression expansion is like a shorthand notation for when you want to apply the same transform to multiple columns. 
+Expression expansion is like a shorthand notation for when you want to apply the same transformation to multiple columns. As we have seen, the expression ```python diff --git a/docs/source/user-guide/expressions/aggregation.md b/docs/source/user-guide/expressions/aggregation.md index f4d963606ffb..52c783fc2750 100644 --- a/docs/source/user-guide/expressions/aggregation.md +++ b/docs/source/user-guide/expressions/aggregation.md @@ -1,47 +1,42 @@ # Aggregation -Polars implements a powerful syntax defined not only in its lazy API, but also in its eager API. Let's take a look at what that means. +The Polars [context](../concepts/expressions-and-contexts.md#contexts) `group_by` lets you apply expressions on subsets of columns, as defined by the unique values of the column over which the data is grouped. +This is a very powerful capability that we explore in this section of the user guide. -We can start with the simple [US congress `dataset`](https://github.com/unitedstates/congress-legislators). +We start by reading in a [US congress `dataset`](https://github.com/unitedstates/congress-legislators): {{code_block('user-guide/expressions/aggregation','dataframe',['DataFrame','Categorical'])}} -#### Basic aggregations - -You can easily combine different aggregations by adding multiple expressions in a -`list`. There is no upper bound on the number of aggregations you can do, and you can -make any combination you want. 
In the snippet below we do the following aggregations: - -Per GROUP `"first_name"` we +```python exec="on" result="text" session="user-guide/expressions" +--8<-- "python/user-guide/expressions/aggregation.py:dataframe" +``` - +## Basic aggregations -- count the number of rows in the group: - - full form: `pl.len()` -- combine the values of gender into a list by omitting an aggregate function: - - full form: `pl.col("gender")` -- get the first value of column `"last_name"` in the group: - - short form: `pl.first("last_name")` (not available in Rust) - - full form: `pl.col("last_name").first()` +You can easily apply multiple expressions to your aggregated values. +Simply list all of the expressions you want inside the function `agg`. +There is no upper bound on the number of aggregations you can do and you can make any combination you want. +In the snippet below we will group the data based on the column “first_name” and then we will apply the following aggregations: - +- count the number of rows in the group (which means we count how many people in the data set have each unique first name); +- combine the values of the column “gender” into a list by referring the column but omitting an aggregate function; and +- get the first value of the column “last_name” within the group. -Besides the aggregation, we immediately sort the result and limit to the top `5` so that -we have a nice summary overview. +After computing the aggregations, we immediately sort the result and limit it to the top five rows so that we have a nice summary overview: {{code_block('user-guide/expressions/aggregation','basic',['group_by'])}} ```python exec="on" result="text" session="user-guide/expressions" ---8<-- "python/user-guide/expressions/aggregation.py:setup" ---8<-- "python/user-guide/expressions/aggregation.py:dataframe" --8<-- "python/user-guide/expressions/aggregation.py:basic" ``` -#### Conditionals +It's that easy! +Let's turn it up a notch. -It's that easy! Let's turn it up a notch. 
Let's say we want to know how -many delegates of a "state" are "Pro" or "Anti" administration. We could directly query -that in the aggregation without the need of a `lambda` or grooming the `DataFrame`. +## Conditionals + +Let's say we want to know how many delegates of a state are “Pro” or “Anti” administration. +We can query that directly in the aggregation without the need for a `lambda` or grooming the dataframe: {{code_block('user-guide/expressions/aggregation','conditional',['group_by'])}} @@ -49,35 +44,57 @@ that in the aggregation without the need of a `lambda` or grooming the `DataFram --8<-- "python/user-guide/expressions/aggregation.py:conditional" ``` -Similarly, this could also be done with a nested GROUP BY, but that doesn't help show off some of these nice features. 😉 +## Filtering -{{code_block('user-guide/expressions/aggregation','nested',['group_by'])}} +We can also filter the groups. +Let's say we want to compute a mean per group, but we don't want to include all values from that group, and we also don't want to actually filter the rows from the dataframe because we need those rows for another aggregation. + +In the example below we show how this can be done. + +!!! note + + Note that we can define Python functions for clarity. + These functions don't cost us anything because they return Polars expressions, we don't apply a custom function over a series during runtime of the query. + Of course, you can write functions that return expressions in Rust, too. + +{{code_block('user-guide/expressions/aggregation','filter',['group_by'])}} ```python exec="on" result="text" session="user-guide/expressions" ---8<-- "python/user-guide/expressions/aggregation.py:nested" +--8<-- "python/user-guide/expressions/aggregation.py:filter" ``` -#### Filtering +Do the average age values look nonsensical? 
+That's because we are working with historical data that dates back to the 1800s and we are doing our computations assuming everyone represented in the dataset is still alive and kicking. -We can also filter the groups. Let's say we want to compute a mean per group, but we -don't want to include all values from that group, and we also don't want to filter the -rows from the `DataFrame` (because we need those rows for another aggregation). +## Nested grouping -In the example below we show how this can be done. +The two previous queries could have been done with a nested `group_by`, but that wouldn't have let us show off some of these features. 😉 +To do a nested `group_by`, simply list the columns that will be used for grouping. -!!! note +First, we use a nested `group_by` to figure out how many delegates of a state are “Pro” or “Anti” administration: - Note that we can make Python functions for clarity. These functions don't cost us anything. That is because we only create Polars expressions, we don't apply a custom function over a `Series` during runtime of the query. Of course, you can make functions that return expressions in Rust, too. +{{code_block('user-guide/expressions/aggregation','nested',['group_by'])}} -{{code_block('user-guide/expressions/aggregation','filter',['group_by'])}} +```python exec="on" result="text" session="user-guide/expressions" +--8<-- "python/user-guide/expressions/aggregation.py:nested" +``` + +Next, we use a nested `group_by` to compute the average age of delegates per state and per gender: + +{{code_block('user-guide/expressions/aggregation','filter-nested',['group_by'])}} ```python exec="on" result="text" session="user-guide/expressions" ---8<-- "python/user-guide/expressions/aggregation.py:filter" +--8<-- "python/user-guide/expressions/aggregation.py:filter-nested" ``` -#### Sorting +Note that we get the same results but the format of the data is different. +Depending on the situation, one format may be more suitable than the other. 
+ +## Sorting -It's common to see a `DataFrame` being sorted for the sole purpose of managing the ordering during a GROUP BY operation. Let's say that we want to get the names of the oldest and youngest politicians per state. We could SORT and GROUP BY. +It is common to see a dataframe being sorted for the sole purpose of managing the ordering duing a grouping operation. +Let's say that we want to get the names of the oldest and youngest politicians per state. +We could start by sorting and then grouping: {{code_block('user-guide/expressions/aggregation','sort',['group_by'])}} @@ -85,7 +102,8 @@ It's common to see a `DataFrame` being sorted for the sole purpose of managing t --8<-- "python/user-guide/expressions/aggregation.py:sort" ``` -However, **if** we also want to sort the names alphabetically, this breaks. Luckily we can sort in a `group_by` context separate from the `DataFrame`. +However, if we also want to sort the names alphabetically, we need to perform an extra sort operation. +Luckily, we can sort in a `group_by` context without changing the sorting of the underlying dataframe: {{code_block('user-guide/expressions/aggregation','sort2',['group_by'])}} @@ -93,7 +111,8 @@ However, **if** we also want to sort the names alphabetically, this breaks. Luck --8<-- "python/user-guide/expressions/aggregation.py:sort2" ``` -We can even sort by another column in the `group_by` context. If we want to know if the alphabetically sorted name is male or female we could add: `pl.col("gender").sort_by(get_person()).first()` +We can even sort a column with the order induced by another column, and this also works inside the context `group_by`. +This modification to the previous query lets us check if the delegate with the first name is male or female: {{code_block('user-guide/expressions/aggregation','sort3',['group_by'])}} @@ -101,25 +120,17 @@ We can even sort by another column in the `group_by` context. 
If we want to know --8<-- "python/user-guide/expressions/aggregation.py:sort3" ``` -### Do not kill parallelization - -!!! warning "Python Users Only" - - The following section is specific to Python, and doesn't apply to Rust. Within Rust, blocks and closures (lambdas) can, and will, be executed concurrently. - -We have all heard that Python is slow, and does "not scale." Besides the overhead of -running "slow" bytecode, Python has to remain within the constraints of the Global -Interpreter Lock (GIL). This means that if you were to use a `lambda` or a custom Python -function to apply during a parallelized phase, Polars speed is capped running Python -code preventing any multiple threads from executing the function. +## Do not kill parallelization -This all feels terribly limiting, especially because we often need those `lambda` functions in a -`.group_by()` step, for example. This approach is still supported by Polars, but -keeping in mind bytecode **and** the GIL costs have to be paid. It is recommended to try to solve your queries using the expression syntax before moving to `lambdas`. If you want to learn more about using `lambdas`, go to the [user defined functions section](./user-defined-functions.md). +!!! warning "Python users only" -### Conclusion + The following section is specific to Python, and doesn't apply to Rust. + Within Rust, blocks and closures (lambdas) can, and will, be executed concurrently. -In the examples above we've seen that we can do a lot by combining expressions. By doing so we delay the use of custom Python functions that slow down the queries (by the slow nature of Python AND the GIL). +Python is generally slower than Rust. +Besides the overhead of running “slow” bytecode, Python has to remain within the constraints of the Global Interpreter Lock (GIL). 
+This means that if you were to use a `lambda` or a custom Python function to apply during a parallelized phase, Polars' speed is capped running Python code, preventing any multiple threads from executing the function. -If we are missing a type expression let us know by opening a -[feature request](https://github.com/pola-rs/polars/issues/new/choose)! +Polars will try to parallelize the computation of the aggregating functions over the groups, so it is recommended that you avoid using `lambda`s and custom Python functions as much as possible. +Instead, try to stay within the realm of the Polars expression API. +This is not always possible, though, so if you want to learn more about using `lambda`s you can go [the user guide section on using user-defined functions](user-defined-python-functions.md). diff --git a/docs/source/user-guide/expressions/athletes_over_country.svg b/docs/source/user-guide/expressions/athletes_over_country.svg new file mode 100644 index 000000000000..850ebb5da40b --- /dev/null +++ b/docs/source/user-guide/expressions/athletes_over_country.svg @@ -0,0 +1,13 @@ + + + + + + + + ABCDEFPTNLNLPTPTNL615423EBFDACPTNLNLPTPTNL213465NLNL \ No newline at end of file diff --git a/docs/source/user-guide/expressions/athletes_over_country_explode.svg b/docs/source/user-guide/expressions/athletes_over_country_explode.svg new file mode 100644 index 000000000000..b90ac715892a --- /dev/null +++ b/docs/source/user-guide/expressions/athletes_over_country_explode.svg @@ -0,0 +1,13 @@ + + + + + + + + ABCDEFPTNLNLPTPTNL615423EBFDACPTNLNLPTPTNL213465NLNLNL \ No newline at end of file diff --git a/docs/source/user-guide/expressions/basic-operations.md b/docs/source/user-guide/expressions/basic-operations.md new file mode 100644 index 000000000000..a4a9d96a8475 --- /dev/null +++ b/docs/source/user-guide/expressions/basic-operations.md @@ -0,0 +1,121 @@ +# Basic operations + +This section shows how to do basic operations on dataframe columns, like do basic arithmetic 
calculations, perform comparisons, and other general-purpose operations. +We will use the following dataframe for the examples that follow: + +{{code_block('user-guide/expressions/operations', 'dataframe', ['DataFrame'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:dataframe" +``` + +## Basic arithmetic + +Polars supports basic arithmetic between series of the same length, or between series and literals. +When literals are mixed with series, the literals are broadcast to match the length of the series they are being used with. + +{{code_block('user-guide/expressions/operations', 'arithmetic', ['operators'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:arithmetic" +``` + +The example above shows that when an arithmetic operation takes `null` as one of its operands, the result is `null`. + +Polars uses operator overloading to allow you to use your language's native arithmetic operators within your expressions. 
+If you prefer, you can use the corresponding named functions, as the snippet below demonstrates: + +{{code_block('user-guide/expressions/operations', 'operator-overloading', ['operators'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:operator-overloading" +``` + +## Comparisons + +Like with arithmetic operations, Polars supports comparisons via the overloaded operators or named functions: + +{{code_block('user-guide/expressions/operations','comparison',['operators'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:comparison" +``` + +## Boolean and bitwise operations + +The operators `&`, `|`, and `~`, are used for the Boolean operations “and”, “or”, and “not”, respectively, alongside the functions of the same name: + +{{code_block('user-guide/expressions/operations', 'boolean', ['operators'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:boolean" +``` + +??? info "Python trivia" + + The Python functions are called `and_`, `or_`, and `not_`, because the words `and`, `or`, and `not` are reserved keywords in Python. + Similarly, we cannot use the keywords `and`, `or`, and `not`, as the Boolean operators because these Python keywords will interpret their operands in the context of Truthy and Falsy through the dunder method `__bool__`. + Thus, we overload the bitwise operators `&`, `|`, and `~`, as the Boolean operators because they are the second best choice. 
+ +These operators/functions can also be used for the respective bitwise operations, alongside the bitwise operator `^` / function `xor`: + +{{code_block('user-guide/expressions/operations', 'bitwise', [])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:bitwise" +``` + +## Counting (unique) values + +Polars has two functions to count the number of unique values in a series. +The function `n_unique` can be used to count the exact number of unique values in a series. +However, for very large data sets, this operation can be quite slow. +In those cases, if an approximation is good enough, you can use the function `approx_n_unique` that uses the algorithm [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) to estimate the result. + +The example below shows an example series where the `approx_n_unique` estimation is wrong by 0.9%: + +{{code_block('user-guide/expressions/operations', 'count', ['n_unique', 'approx_n_unique'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:count" +``` + +You can get more information about the unique values and their counts with the function `value_counts`, that Polars also provides: + +{{code_block('user-guide/expressions/operations', 'value_counts', ['value_counts'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:value_counts" +``` + +The function `value_counts` returns the results in [structs, a data type that we will explore in a later section](structs.md). 
+ +Alternatively, if you only need a series with the unique values or a series with the unique counts, they are one function away: + +{{code_block('user-guide/expressions/operations', 'unique_counts', ['unique', 'unique_counts'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:unique_counts" +``` + +Note that we need to specify `maintain_order=True` in the function `unique` so that the order of the results is consistent with the order of the results in `unique_counts`. +See the API reference for more information. + +## Conditionals + +Polars supports something akin to a ternary operator through the function `when`, which is followed by one function `then` and an optional function `otherwise`. + +The function `when` accepts a predicate expression. +The values that evaluate to `True` are replaced by the corresponding values of the expression inside the function `then`. +The values that evaluate to `False` are replaced by the corresponding values of the expression inside the function `otherwise` or `null`, if `otherwise` is not provided. + +The example below applies one step of the [Collatz conjecture](https://en.wikipedia.org/wiki/Collatz_conjecture) to the numbers in the column “nrs”: + +{{code_block('user-guide/expressions/operations', 'collatz', ['when'])}} + +```python exec="on" result="text" session="expressions/operations" +--8<-- "python/user-guide/expressions/operations.py:collatz" +``` + +You can also emulate a chain of an arbitrary number of conditionals, akin to Python's `elif` statement, by chaining an arbitrary number of consecutive blocks of `.when(...).then(...)`. +In those cases, and for each given value, Polars will only consider a replacement expression that is deeper within the chain if the previous predicates all failed for that value. 
diff --git a/docs/source/user-guide/expressions/casting.md b/docs/source/user-guide/expressions/casting.md index f0c625d19f28..c9a79139d37e 100644 --- a/docs/source/user-guide/expressions/casting.md +++ b/docs/source/user-guide/expressions/casting.md @@ -1,101 +1,116 @@ # Casting -Casting converts the [underlying `DataType` of a column](../concepts/data-types-and-structures.md) to a new one. -Casting is available with the `cast()` method. +Casting converts the [underlying data type of a column](../concepts/data-types-and-structures.md) to a new one. +Casting is available through the function `cast`. -The `cast` method includes a `strict` parameter that determines how Polars behaves when it encounters a value that can't be converted from the source `DataType` to the target `DataType`. By default, `strict=True`, which means that Polars will throw an error to notify the user of the failed conversion and provide details on the values that couldn't be cast. On the other hand, if `strict=False`, any values that can't be converted to the target `DataType` will be quietly converted to `null`. +The function `cast` includes a parameter `strict` that determines how Polars behaves when it encounters a value that cannot be converted from the source data type to the target data type. +The default behaviour is `strict=True`, which means that Polars will thrown an error to notify the user of the failed conversion while also providing details on the values that couldn't be cast. +On the other hand, if `strict=False`, any values that cannot be converted to the target data type will be quietly converted to `null`. -## Numerics +## Basic example -Let's take a look at the following `DataFrame` which contains both integers and floating point numbers. 
+Let's take a look at the following dataframe which contains both integers and floating point numbers: -{{code_block('user-guide/expressions/casting','dfnum',['DataFrame'])}} +{{code_block('user-guide/expressions/casting', 'dfnum', [])}} -```python exec="on" result="text" session="user-guide/cast" ---8<-- "python/user-guide/expressions/casting.py:setup" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:dfnum" ``` -To perform casting operations between floats and integers, or vice versa, we can invoke the `cast()` function. +To perform casting operations between floats and integers, or vice versa, we use the function `cast`: {{code_block('user-guide/expressions/casting','castnum',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:castnum" ``` -Note that in the case of decimal values these are rounded downwards when casting to an integer. +Note that floating point numbers are truncated when casting to an integer data type. -##### Downcast +## Downcasting numerical data types -Reducing the memory footprint is also achievable by modifying the number of bits allocated to an element. As an illustration, the code below demonstrates how casting from `Int64` to `Int16` and from `Float64` to `Float32` can be used to lower memory usage. +You can reduce the memory footprint of a column by changing the precision associated with its numeric data type. 
+As an illustration, the code below demonstrates how casting from `Int64` to `Int16` and from `Float64` to `Float32` can be used to lower memory usage: -{{code_block('user-guide/expressions/casting','downcast',['cast'])}} +{{code_block('user-guide/expressions/casting','downcast',['cast', 'estimated_size'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:downcast" ``` -#### Overflow - -When performing downcasting, it is crucial to ensure that the chosen number of bits (such as 64, 32, or 16) is sufficient to accommodate the largest and smallest numbers in the column. For example, using a 32-bit signed integer (`Int32`) allows handling integers within the range of -2147483648 to +2147483647, while using `Int8` covers integers between -128 to 127. Attempting to cast to a `DataType` that is too small will result in a `ComputeError` thrown by Polars, as the operation is not supported. +When performing downcasting it is crucial to ensure that the chosen number of bits (such as 64, 32, or 16) is sufficient to accommodate the largest and smallest numbers in the column. +For example, a 32-bit signed integer (`Int32`) represents integers between -2147483648 and 2147483647, inclusive, while an 8-bit signed integer only represents integers between -128 and 127, inclusive. +Attempting to downcast to a data type with insufficient precision results in an error thrown by Polars: {{code_block('user-guide/expressions/casting','overflow',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:overflow" ``` -You can set the `strict` parameter to `False`, this converts values that are overflowing to null values. 
+If you set the parameter `strict` to `False` the overflowing/underflowing values are converted to `null`: {{code_block('user-guide/expressions/casting','overflow2',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:overflow2" ``` -## Strings +## Converting strings to numeric data types -Strings can be casted to numerical data types and vice versa: +Strings that represent numbers can be converted to the appropriate data types via casting. +The opposite conversion is also possible: {{code_block('user-guide/expressions/casting','strings',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:strings" ``` -In case the column contains a non-numerical value, Polars will throw a `ComputeError` detailing the conversion error. Setting `strict=False` will convert the non float value to `null`. +In case the column contains a non-numerical value, or a poorly formatted one, Polars will throw an error with details on the conversion error. +You can set `strict=False` to circumvent the error and get a `null` value instead. {{code_block('user-guide/expressions/casting','strings2',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:strings2" ``` ## Booleans -Booleans can be expressed as either 1 (`True`) or 0 (`False`). It's possible to perform casting operations between a numerical `DataType` and a boolean, and vice versa. However, keep in mind that casting from a string (`String`) to a boolean is not permitted. +Booleans can be expressed as either 1 (`True`) or 0 (`False`). +It's possible to perform casting operations between a numerical data type and a Boolean, and vice versa. 
+ +When converting numbers to Booleans, the number 0 is converted to `False` and all other numbers are converted to `True`, in alignment with Python's Truthy and Falsy values for numbers: {{code_block('user-guide/expressions/casting','bool',['cast'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:bool" ``` -## Dates +## Parsing / formatting temporal data types -Temporal data types such as `Date` or `Datetime` are represented as the number of days (`Date`) and microseconds (`Datetime`) since epoch. Therefore, casting between the numerical types and the temporal data types is allowed. +All temporal data types are represented internally as the number of time units elapsed since a reference moment, usually referred to as the epoch. +For example, values of the data type `Date` are stored as the number of days since the epoch. +For the data type `Datetime` the time unit is the microsecond (us) and for `Time` the time unit is the nanosecond (ns). -{{code_block('user-guide/expressions/casting','dates',['cast'])}} +Casting between numerical types and temporal data types is allowed and exposes this relationship: -```python exec="on" result="text" session="user-guide/cast" +{{code_block('user-guide/expressions/casting','dates',['cast', 'date_range', 'datetime_range'])}} + +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:dates" ``` -To convert between strings and `Dates`/`Datetimes`, `dt.to_string` and `str.to_datetime` are utilized. Polars adopts the [chrono format syntax](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) for formatting. It's worth noting that `str.to_datetime` features additional options that support timezone functionality. Refer to the API documentation for further information. 
+To format temporal data types as strings we can use the function `dt.to_string` and to parse temporal data types from strings we can use the function `str.to_datetime`. +Both functions adopt the [chrono format syntax](https://docs.rs/chrono/latest/chrono/format/strftime/index.html) for formatting. {{code_block('user-guide/expressions/casting','dates2',['dt.to_string','str.to_date'])}} -```python exec="on" result="text" session="user-guide/cast" +```python exec="on" result="text" session="user-guide/casting" --8<-- "python/user-guide/expressions/casting.py:dates2" ``` + +It's worth noting that `str.to_datetime` features additional options that support timezone functionality. +Refer to the API documentation for further information. diff --git a/docs/source/user-guide/expressions/categorical-data-and-enums.md b/docs/source/user-guide/expressions/categorical-data-and-enums.md index 9d1e5eee8905..2acdfe1f0316 100644 --- a/docs/source/user-guide/expressions/categorical-data-and-enums.md +++ b/docs/source/user-guide/expressions/categorical-data-and-enums.md @@ -1,11 +1,180 @@ # Categorical data and enums -Categorical data represents string data where the values in the column have a finite set of values (usually way smaller than the length of the column). You can think about columns on gender, countries, currency pairings, etc. Storing these values as plain strings is a waste of memory and performance as we will be repeating the same string over and over again. Additionally, in the case of joins we are stuck with expensive string comparisons. +A column that holds string values that can only take on one of a limited number of possible values is a column that holds [categorical data](https://en.wikipedia.org/wiki/Categorical_variable). +Usually, the number of possible values is much smaller than the length of the column. +Some typical examples include your nationality, the operating system of your computer, or the license that your favorite open source project uses. 
-That is why Polars supports encoding string values in dictionary format. Working with categorical data in Polars can be done with two different DataTypes: `Enum`,`Categorical`. Both have their own use cases which we will explain further on this page. -First we will look at what a categorical is in Polars. +When working with categorical data you can use Polars' dedicated types, `Categorical` and `Enum`, to make your queries more performant. +Now, we will see what are the differences between the two data types `Categorical` and `Enum` and when you should use when data type or the other. +We also include some notes on [why the data types `Categorical` and `Enum` are more efficient than using the plain string values](#performance-considerations-on-categorical-data-types) in the end of this user guide section. -In Polars a categorical is defined as a string column which is encoded by a dictionary. A string column would be split into two elements: encodings and the actual string values. +## `Enum` vs `Categorical` + +In short, you should prefer `Enum` over `Categorical` whenever possible. +When the categories are fixed and known up front, use `Enum`. +When you don't know the categories or they are not fixed then you must use `Categorical`. +In case your requirements change along the way you can always cast from one to the other. + +{{code_block('user-guide/concepts/data-types/categoricals','example',[])}} + +From the code block above you can see that the data type `Enum` requires the categories to be provided upfront, while the data type `Categorical` infers the categories at runtime. + +## Data type `Enum` + +### Creating an `Enum` + +The data type `Enum` is an ordered categorical data type. +To use the data type `Enum` you have to specify the categories in advance to create a new data type that is a variant of an `Enum`. +Then, when creating a new series, a new dataframe, or when casting a string column, you can use that `Enum` variant. 
+ +{{code_block('user-guide/expressions/categoricals', 'enum-example', ['Enum'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:enum-example" +``` + +### Invalid values + +Polars will raise an error if you try to specify a data type `Enum` whose categories do not include all the values present: + +{{code_block('user-guide/expressions/categoricals', 'enum-wrong-value', ['Enum'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:enum-wrong-value" +``` + +If you are in a position where you cannot know all of the possible values in advance and erroring on unknown values is semantically wrong, you may need to [use the data type `Categorical`](#data-type-categorical). + +### Category ordering and comparison + +The data type `Enum` is ordered and the order is induced by the order in which you specify the categories. +The example below uses log levels as an example of where an ordered `Enum` is useful: + +{{code_block('user-guide/expressions/categoricals', 'log-levels', ['Enum'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:log-levels" +``` + +This example shows that we can compare `Enum` values with a string, but this only works if the string matches one of the `Enum` values. +If we compared the column “level” with any string other than `"debug"`, `"info"`, `"warning"`, or `"error"`, Polars would raise an exception. + +Columns with the data type `Enum` can also be compared with other columns that have the same data type `Enum` or columns that hold strings, but only if all the strings are valid `Enum` values. + +## Data type `Categorical` + +The data type `Categorical` can be seen as a more flexible version of `Enum`. 
+ +### Creating a `Categorical` series + +To use the data type `Categorical`, you can cast a column of strings or specify `Categorical` as the data type of a series or dataframe column: + +{{code_block('user-guide/expressions/categoricals', 'categorical-example', ['Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:categorical-example" +``` + +Having Polars infer the categories for you may sound strictly better than listing the categories beforehand, but this inference comes with a performance cost. +That is why, whenever possible, you should use `Enum`. +You can learn more by [reading the subsection about the data type `Categorical` and its encodings](#data-type-categorical-and-encodings). + +### Lexical comparison with strings + +When comparing a `Categorical` column with a string, Polars will perform a lexical comparison: + +{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-string', ['Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:categorical-comparison-string" +``` + +You can also compare a column of strings with your `Categorical` column, and the comparison will also be lexical: + +{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-string-column', ['Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:categorical-comparison-string-column" +``` + +Although it is possible to compare a string column with a categorical column, it is typically more efficient to compare two categorical columns. +We will see how to do that next. + +### Comparing `Categorical` columns and the string cache + +You are told that comparing columns with the data type `Categorical` is more efficient than if one of them is a string column. 
+So, you change your code so that the second column is also a categorical column and then you perform your comparison...
+But Polars raises an exception:
+
+{{code_block('user-guide/expressions/categoricals', 'categorical-comparison-categorical-column', ['Categorical'])}}
+
+```python exec="on" result="text" session="expressions/categoricals"
+--8<-- "python/user-guide/expressions/categoricals.py:categorical-comparison-categorical-column"
+```
+
+By default, the values in columns with the data type `Categorical` are [encoded in the order they are seen in the column](#encodings), and independently from other columns, which means that Polars cannot efficiently compare two categorical columns that were created independently.
+
+Enabling the Polars string cache and creating the columns with the cache enabled fixes this issue:
+
+{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-equality', ['StringCache', 'Categorical'])}}
+
+```python exec="on" result="text" session="expressions/categoricals"
+--8<-- "python/user-guide/expressions/categoricals.py:stringcache-categorical-equality"
+```
+
+Note that using [the string cache comes at a performance cost](#using-the-global-string-cache).
+
+### Combining `Categorical` columns
+
+The string cache is also useful in any operation that combines or mixes two columns with the data type `Categorical` in any way.
+An example of this is when [concatenating two dataframes vertically](../getting-started.md#concatenating-dataframes):
+
+{{code_block('user-guide/expressions/categoricals', 'concatenating-categoricals', ['StringCache', 'Categorical'])}}
+
+```python exec="on" result="text" session="expressions/categoricals"
+--8<-- "python/user-guide/expressions/categoricals.py:concatenating-categoricals"
+```
+
+In this case, Polars issues a warning complaining about an expensive re-encoding that incurs a performance hit.
+Polars then suggests using the data type `Enum` if possible, or using the string cache. +To understand the issue with this operation and why Polars raises an error, please read the final section about [the performance considerations of using categorical data types](#performance-considerations-on-categorical-data-types). + +### Comparison between `Categorical` columns is not lexical + +When comparing two columns with data type `Categorical`, Polars does not perform lexical comparison between the values by default. +If you want lexical ordering, you need to specify so when creating the column: + +{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-comparison-lexical', ['StringCache', 'Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:stringcache-categorical-comparison-lexical" +``` + +Otherwise, the order is inferred together with the values: + +{{code_block('user-guide/expressions/categoricals', 'stringcache-categorical-comparison-physical', ['StringCache', 'Categorical'])}} + +```python exec="on" result="text" session="expressions/categoricals" +--8<-- "python/user-guide/expressions/categoricals.py:stringcache-categorical-comparison-physical" +``` + +## Performance considerations on categorical data types + +This part of the user guide explains + +- why categorical data types are more performant than the string literals; and +- why Polars needs a string cache when doing some operations with the data type `Categorical`. + +### Encodings + +Categorical data represents string data where the values in the column have a finite set of values (usually way smaller than the length of the column). +Storing these values as plain strings is a waste of memory and performance as we will be repeating the same string over and over again. +Additionally, in operations like joins we have to perform expensive string comparisons. 
+ +Categorical data types like `Enum` and `Categorical` let you encode the string values in a cheaper way, establishing a relationship between a cheap encoding value and the original string literal. + +As an example of a sensible encoding, Polars could choose to represent the finite set of categories as positive integers. +With that in mind, the diagram below shows a regular string column and a possible representation of a Polars column with the categorical data type: @@ -18,25 +187,25 @@ In Polars a categorical is defined as a string column which is encoded by a dict - + - + - + - + - + - + - +
String Column Categorical Column
Polar BearPolar
Panda BearPanda
Brown BearBrown
Panda BearPanda
Brown BearBrown
Brown BearBrown
Polar BearPolar
@@ -87,13 +256,13 @@ In Polars a categorical is defined as a string column which is encoded by a dict - Polar Bear + Polar - Panda Bear + Panda - Brown Bear + Brown @@ -104,25 +273,26 @@ In Polars a categorical is defined as a string column which is encoded by a dict -The physical `0` in this case encodes (or maps) to the value 'Polar Bear', the value `1` encodes to 'Panda Bear' and the value `2` to 'Brown Bear'. This encoding has the benefit of only storing the string values once. Additionally, when we perform operations (e.g. sorting, counting) we can work directly on the physical representation which is much faster than the working with string data. +The physical `0` in this case encodes (or maps) to the value 'Polar', the value `1` encodes to 'Panda', and the value `2` to 'Brown'. +This encoding has the benefit of only storing the string values once. +Additionally, when we perform operations (e.g. sorting, counting) we can work directly on the physical representation which is much faster than the working with string data. -## `Enum` vs `Categorical` - -Polars supports two different DataTypes for working with categorical data: `Enum` and `Categorical`. When the categories are known up front use `Enum`. When you don't know the categories or they are not fixed then you use `Categorical`. In case your requirements change along the way you can always cast from one to the other. - -{{code_block('user-guide/concepts/data-types/categoricals','example',[])}} +### Encodings for the data type `Enum` are global -From the code block above you can see that the `Enum` data type requires the upfront while the categorical data type infers the categories. +When working with the data type `Enum` we specify the categories in advance. +This way, Polars can ensure different columns and even different datasets have the same encoding and there is no need for expensive re-encoding or cache lookups. 
-### `Categorical` data type
+### Data type `Categorical` and encodings
-The `Categorical` data type is a flexible one. Polars will add categories on the fly if it sees them. This sounds like a strictly better version compared to the `Enum` data type as we can simply infer the categories, however inferring comes at a cost. The main cost here is we have no control over our encodings.
+The fact that the categories for the data type `Categorical` are inferred comes at a cost.
+The main cost here is that we have no control over our encodings.
-Consider the following scenario where we append the following two categorical `Series`
+Consider the following scenario, where we append two categorical series:
{{code_block('user-guide/concepts/data-types/categoricals','append',[])}}
-Polars encodes the string values in order as they appear. So the series would look like this:
+Polars encodes the string values in the order they appear.
+So, the series would look like this:
@@ -238,94 +408,102 @@ Polars encodes the string values in order as they appear. So the series would lo
cat_series cat2_series
-Combining the `Series` becomes a non-trivial task which is expensive as the physical value of `0` represents something different in both `Series`. Polars does support these types of operations for convenience, however in general these should be avoided due to its slower performance as it requires making both encodings compatible first before doing any merge operations. - -#### Using the global string cache - -One way to handle this problem is to enable a `StringCache`. When you enable the `StringCache` strings are no longer encoded in the order they appear on a per-column basis. Instead, the string cache ensures a single encoding for each string. The string `Polar` will always map the same physical for all categorical columns made under the string cache. -Merge operations (e.g. appends, joins) are cheap as there is no need to make the encodings compatible first, solving the problem we had above. - -{{code_block('user-guide/concepts/data-types/categoricals','global_append',[])}} - -However, the string cache does come at a small performance hit during construction of the `Series` as we need to look up / insert the string value in the cache. Therefore, it is preferred to use the `Enum` Data Type if you know your categories in advance. - -### `Enum data type` - -In the `Enum` data type we specify the categories in advance. This way we ensure categoricals from different columns or different datasets have the same encoding and there is no need for expensive re-encoding or cache lookups. - -{{code_block('user-guide/concepts/data-types/categoricals','enum_append',[])}} - - - -Polars will raise an `OutOfBounds` error when a value is encountered which is not specified in the `Enum`. 
- -{{code_block('user-guide/concepts/data-types/categoricals','enum_error',[])}} - -```python exec="on" result="text" session="user-guide/datatypes/categoricals" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:setup" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:enum_error" -``` - -## Comparisons - - - -The following types of comparisons operators are allowed for categorical data: +Combining the series becomes a non-trivial task which is expensive as the physical value of `0` represents something different in both series. +Polars does support these types of operations for convenience, however these should be avoided due to its slower performance as it requires making both encodings compatible first before doing any merge operations. -- Categorical vs Categorical -- Categorical vs String +### Using the global string cache -### `Categorical` Type +One way to handle this reencoding problem is to enable the string cache. +Under the string cache, the diagram would instead look like this: -For the `Categorical` type comparisons are valid if they have the same global cache set or if they have the same underlying categories in the same order. 
- -{{code_block('user-guide/concepts/data-types/categoricals','global_equality',[])}} - -```python exec="on" result="text" session="user-guide/datatypes/categoricals" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:setup" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:global_equality" -``` - -For `Categorical` vs `String` comparisons Polars uses lexical ordering to determine the result: - -{{code_block('user-guide/concepts/data-types/categoricals','str_compare_single',[])}} - -```python exec="on" result="text" session="user-guide/datatypes/categoricals" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_compare_single" -``` - -{{code_block('user-guide/concepts/data-types/categoricals','str_compare',[])}} - -```python exec="on" result="text" session="user-guide/datatypes/categoricals" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_compare" -``` - -### `Enum` Type - -For `Enum` type comparisons are valid if they have the same categories. - -{{code_block('user-guide/concepts/data-types/categoricals','equality',[])}} - -```python exec="on" result="text" session="user-guide/datatypes/categoricals" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:equality" -``` - -For `Enum` vs `String` comparisons the order within the categories is used instead of lexical ordering. In order for a comparison to be valid all values in the `String` column should be present in the `Enum` categories list. 
- -{{code_block('user-guide/concepts/data-types/categoricals','str_enum_compare_error',[])}} - -```python exec="on" result="text" session="user-guide/datatypes/categoricals" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_enum_compare_error" -``` - -{{code_block('user-guide/concepts/data-types/categoricals','str_enum_compare_single',[])}} - -```python exec="on" result="text" session="user-guide/datatypes/categoricals" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_enum_compare_single" -``` + + + + + +
SeriesString cache
+ + + + + + +
cat_seriescat2_series
+ + + + + + + + + + + + + + + + + + + + + + + +
Physical
0
1
2
2
0
+
+ + + + + + + + + + + + + + + + + + + + + + + +
Physical
1
2
2
0
0
+
+
+ + + + + + + + + + + + + + + + + +
Categories
Polar
Panda
Brown
+
-{{code_block('user-guide/concepts/data-types/categoricals','str_enum_compare',[])}} +When you enable the string cache, strings are no longer encoded in the order they appear on a per-column basis. +Instead, the encoding is shared across columns. +The value 'Polar' will always be encoded by the same value for all categorical columns created under the string cache. +Merge operations (e.g. appends, joins) become cheap again as there is no need to make the encodings compatible first, solving the problem we had above. -```python exec="on" result="text" session="user-guide/datatypes/categoricals" ---8<-- "python/user-guide/concepts/data-types/categoricals.py:str_enum_compare" -``` +However, the string cache does come at a small performance hit during construction of the series as we need to look up or insert the string values in the cache. +Therefore, it is preferred to use the data type `Enum` if you know your categories in advance. diff --git a/docs/source/user-guide/expressions/column-selections.md b/docs/source/user-guide/expressions/column-selections.md deleted file mode 100644 index 92a87dc2b760..000000000000 --- a/docs/source/user-guide/expressions/column-selections.md +++ /dev/null @@ -1,134 +0,0 @@ -# Column selections - -Let's create a dataset to use in this section: - -{{code_block('user-guide/expressions/column-selections','selectors_df',['DataFrame'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:setup" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_df" -``` - -## Expression expansion - -As we've seen in the previous section, we can select specific columns using the `pl.col` method. It can also select multiple columns - both as a means of convenience, and to _expand_ the expression. - -This kind of convenience feature isn't just decorative or syntactic sugar. 
It allows for a very powerful application of [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: a single expression that specifies multiple columns expands into a list of expressions (depending on the DataFrame schema), resulting in being able to select multiple columns + run computation on them! - -### Select all, or all but some - -We can select all columns in the `DataFrame` object by providing the argument `*`: - -{{code_block('user-guide/expressions/column-selections', 'all',['all'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:all" -``` - -Often, we don't just want to include all columns, but include all _while_ excluding a few. This can be done easily as well: - -{{code_block('user-guide/expressions/column-selections','exclude',['exclude'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:exclude" -``` - -### By multiple strings - -Specifying multiple strings allows expressions to _expand_ to all matching columns: - -{{code_block('user-guide/expressions/column-selections','expansion_by_names',['dt.to_string'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:expansion_by_names" -``` - -### By regular expressions - -Multiple column selection is possible by regular expressions also, by making sure to wrap the regex by `^` and `$` to let `pl.col` know that a regex selection is expected: - -{{code_block('user-guide/expressions/column-selections','expansion_by_regex',[])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:expansion_by_regex" -``` - -### By data type - -`pl.col` can select multiple columns using Polars data types: - 
-{{code_block('user-guide/expressions/column-selections','expansion_by_dtype',['n_unique'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:expansion_by_dtype" -``` - -## Using `selectors` - -Polars also allows for the use of intuitive selections for columns based on their name, `dtype` or other properties; and this is built on top of existing functionality outlined in `col` used above. It is recommended to use them by importing and aliasing `polars.selectors` as `cs`. - -### By `dtype` - -To select just the integer and string columns, we can do: - -{{code_block('user-guide/expressions/column-selections','selectors_intro',['selectors'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_intro" -``` - -### Applying set operations - -These _selectors_ also allow for set based selection operations. For instance, to select the **numeric** columns **except** the **first** column that indicates row numbers: - -{{code_block('user-guide/expressions/column-selections','selectors_diff',['cs.first', 'cs.numeric'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_diff" -``` - -We can also select the row number by name **and** any **non**-numeric columns: - -{{code_block('user-guide/expressions/column-selections','selectors_union',['cs.by_name', 'cs.numeric'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_union" -``` - -### By patterns and substrings - -_Selectors_ can also be matched by substring and regex patterns: - -{{code_block('user-guide/expressions/column-selections','selectors_by_name',['cs.contains', 'cs.matches'])}} - -```python exec="on" result="text" 
session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_by_name" -``` - -### Converting to expressions - -What if we want to apply a specific operation on the selected columns (i.e. get back to representing them as **expressions** to operate upon)? We can simply convert them using `as_expr` and then proceed as normal: - -{{code_block('user-guide/expressions/column-selections','selectors_to_expr',['cs.temporal'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_to_expr" -``` - -### Debugging `selectors` - -Polars also provides two helpful utility functions to aid with using selectors: `is_selector` and `expand_selector`: - -{{code_block('user-guide/expressions/column-selections','selectors_is_selector_utility',['is_selector'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_is_selector_utility" -``` - -To predetermine the column names that are selected, which is especially useful for a LazyFrame object: - -{{code_block('user-guide/expressions/column-selections','selectors_colnames_utility',['expand_selector'])}} - -```python exec="on" result="text" session="user-guide/column-selections" ---8<-- "python/user-guide/expressions/column-selections.py:selectors_colnames_utility" -``` diff --git a/docs/source/user-guide/expressions/expression-expansion.md b/docs/source/user-guide/expressions/expression-expansion.md new file mode 100644 index 000000000000..450a1d0b078d --- /dev/null +++ b/docs/source/user-guide/expressions/expression-expansion.md @@ -0,0 +1,356 @@ +# Expression expansion + +As you've seen in [the section about expressions and contexts](../concepts/expressions-and-contexts.md), expression expansion is a feature that enables you to write a single expression that can expand to multiple different expressions, 
possibly depending on the schema of the context in which the expression is used. + +This feature isn't just decorative or syntactic sugar. +It allows for a very powerful application of [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) principles in your code: +a single expression that specifies multiple columns expands into a list of expressions, which means you can write one single expression and reuse the computation that it represents. + +In this section we will show several forms of expression expansion and we will be using the dataframe that you can see below for that effect: + +{{code_block('user-guide/expressions/expression-expansion', 'df', [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:df" +``` + +## Function `col` + +The function `col` is the most common way of making use of expression expansion features in Polars. +Typically used to refer to one column of a dataframe, in this section we explore other ways in which you can use `col`. + +### Explicit expansion by column name + +The simplest form of expression expansion happens when you provide multiple column names to the function `col`. + +The example below uses a single function `col` with multiple column names to convert the values in USD to EUR: + +{{code_block('user-guide/expressions/expression-expansion', 'col-with-names', ['col'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:col-with-names" +``` + +When you list the column names you want the expression to expand to, you can predict what the expression will expand to. 
+In this case, the expression that does the currency conversion is expanded to a list of five expressions:
+
+{{code_block('user-guide/expressions/expression-expansion', 'expression-list', ['col'])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:expression-list"
+```
+
+### Expansion by data type
+
+We had to type five column names in the previous example but the function `col` can also conveniently accept one or more data types.
+If you provide data types instead of column names, the expression is expanded to all columns that match one of the data types provided.
+
+The example below performs the exact same computation as before:
+
+{{code_block('user-guide/expressions/expression-expansion', 'col-with-dtype', ['col'])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:col-with-dtype"
+```
+
+When we use a data type with expression expansion we cannot know, beforehand, how many columns a single expression will expand to.
+We need the schema of the input dataframe if we want to determine the final list of expressions that will be applied.
+
+If we weren't sure about whether the price columns were of the type `Float64` or `Float32`, we could specify both data types in the function `col`:
+
+{{code_block('user-guide/expressions/expression-expansion', 'col-with-dtypes', ['col'])}}
+
+```python exec="on" result="text" session="expressions/expression-expansion"
+--8<-- "python/user-guide/expressions/expression-expansion.py:col-with-dtypes"
+```
+
+### Expansion by pattern matching
+
+You can also use regular expressions to specify patterns that are used to match the column names.
+To distinguish between a regular column name and expansion by pattern matching, regular expressions start and end with `^` and `$`, respectively.
+This also means that the pattern must match against the whole column name string. + +Regular expressions can be mixed with regular column names: + +{{code_block('user-guide/expressions/expression-expansion', 'col-with-regex', ['col'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:col-with-regex" +``` + +### Arguments cannot be of mixed types + +The function `col` accepts an arbitrary number of strings (as [column names](#explicit-expansion-by-column-name) or as [regular expressions](#expansion-by-pattern-matching)) or an arbitrary number of data types, but you cannot mix both in the same function call: + +{{code_block('user-guide/expressions/expression-expansion', 'col-error', ['col'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:col-error" +``` + +## Selecting all columns + +Polars provides the function `all` as shorthand notation to refer to all columns of a dataframe: + +{{code_block('user-guide/expressions/expression-expansion', 'all', ['all'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:all" +``` + +!!! note +The function `all` is syntactic sugar for `pl.col("*")`, but since the argument `"*"` is a special case and `pl.all` reads more like English, the usage of `pl.all` is preferred. + +## Excluding columns + +Polars also provides a mechanism to exclude certain columns from expression expansion. 
+For that, you use the function `exclude`, which accepts exactly the same types of arguments as `col`: + +{{code_block('user-guide/expressions/expression-expansion', 'all-exclude', ['exclude'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:all-exclude" +``` + +Naturally, the function `exclude` can also be used after the function `col`: + +{{code_block('user-guide/expressions/expression-expansion', 'col-exclude', ['exclude'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:col-exclude" +``` + +## Column renaming + +By default, when you apply an expression to a column, the result keeps the same name as the original column. + +Preserving the column name can be semantically wrong and in certain cases Polars may even raise an error if duplicate names occur: + +{{code_block('user-guide/expressions/expression-expansion', 'duplicate-error', [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:duplicate-error" +``` + +To prevent errors like this, and to allow users to rename their columns when appropriate, Polars provides a series of functions that let you change the name of a column or a group of columns. 
+ +### Renaming a single column with `alias` + +The function `alias` has been used thoroughly in the documentation already and it lets you rename a single column: + +{{code_block('user-guide/expressions/expression-expansion', 'alias', ['alias'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:alias" +``` + +### Prefixing and suffixing column names + +When using expression expansion you cannot use the function `alias` because the function `alias` is designed specifically to rename a single column. + +When it suffices to add a static prefix or a static suffix to the existing names, we can use the functions `prefix` and `suffix` from the namespace `name`: + +{{code_block('user-guide/expressions/expression-expansion', 'prefix-suffix', ['Expr.name', 'prefix', 'suffix'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:prefix-suffix" +``` + +### Dynamic name replacement + +If a static prefix/suffix is not enough, the namespace `name` also provides the function `map` that accepts a callable that accepts the old column names and produces the new ones: + +{{code_block('user-guide/expressions/expression-expansion', 'name-map', ['Expr.name', 'map'])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:name-map" +``` + +See the API reference for the full contents of the namespace `name`. + +## Programmatically generating expressions + +Expression expansion is a very useful feature but it does not solve all of your problems. +For example, if we want to compute the day and year amplitude of the prices of the stocks in our dataframe, expression expansion won't help us. 
+ +At first, you may think about using a `for` loop: + +{{code_block('user-guide/expressions/expression-expansion', 'for-with_columns', [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:for-with_columns" +``` + +Do not do this. +Instead, generate all of the expressions you want to compute programmatically and use them only once in a context. +Loosely speaking, you want to swap the `for` loop with the context `with_columns`. +In practice, you could do something like the following: + +{{code_block('user-guide/expressions/expression-expansion', 'yield-expressions', [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:yield-expressions" +``` + +This produces the same final result and by specifying all of the expressions in one go we give Polars the opportunity to: + +1. do a better job at optimising the query; and +2. parallelise the execution of the actual computations. + +## More flexible column selections + +Polars comes with the submodule `selectors` that provides a number of functions that allow you to write more flexible column selections for expression expansion. + +!!! warning +This functionality is not available in Rust yet. Refer to [Polars issue #10594](https://github.com/pola-rs/polars/issues/10594). 
+ +As a first example, here is how we can use the functions `string` and `ends_with`, and the set operations that the functions from `selectors` support, to select all string columns and the columns whose names end with `"_high"`: + +{{code_block('user-guide/expressions/expression-expansion', 'selectors', [], ['selectors'], [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:selectors" +``` + +The submodule `selectors` provides [a number of selectors that match based on the data type of the columns](#selectors-for-data-types), of which the most useful are the functions that match a whole category of types, like `cs.numeric` for all numeric data types or `cs.temporal` for all temporal data types. + +The submodule `selectors` also provides [a number of selectors that match based on patterns in the column names](#selectors-for-column-name-patterns) which make it more convenient to specify common patterns you may want to check for, like the function `cs.ends_with` that was shown above. 
+ +### Combining selectors with set operations + +We can combine multiple selectors using set operations and the usual Python operators: + +| Operator | Operation | +| -------- | -------------------- | +| `A | B` | Union | +| `A & B` | Intersection | +| `A - B` | Difference | +| `A ^ B` | Symmetric difference | +| `~A` | Complement | + +The next example matches all non-string columns that contain an underscore in the name: + +{{code_block('user-guide/expressions/expression-expansion', 'selectors-set-operations', [], ['selectors'], [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:selectors-set-operations" +``` + +### Resolving operator ambiguity + +Expression functions can be chained on top of selectors: + +{{code_block('user-guide/expressions/expression-expansion', 'selectors-expressions', [], ['selectors'], [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:selectors-expressions" +``` + +However, some operators have been overloaded to operate both on Polars selectors and on expressions. +For example, the operator `~` on a selector represents [the set operation “complement”](#combining-selectors-with-set-operations) and on an expression represents the Boolean operation of negation. + +When you use a selector and then want to use, in the context of an expression, one of the [operators that act as set operators for selectors](#combining-selectors-with-set-operations), you can use the function `as_expr`. + +Below, we want to negate the Boolean values in the columns “has_partner”, “has_kids”, and “has_tattoos”.
+If we are not careful, the combination of the operator `~` and the selector `cs.starts_with("has_")` will actually select the columns that we do not care about: + +{{code_block('user-guide/expressions/expression-expansion', 'selector-ambiguity', [], [], [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:selector-ambiguity" +``` + +The correct solution uses `as_expr`: + +{{code_block('user-guide/expressions/expression-expansion', 'as_expr', [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:as_expr" +``` + +### Debugging selectors + +When you are not sure whether you have a Polars selector at hand or not, you can use the function `cs.is_selector` to check: + +{{code_block('user-guide/expressions/expression-expansion', 'is_selector', [], ['is_selector'], [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:is_selector" +``` + +This should help you avoid any ambiguous situations where you think you are operating with expressions but are in fact operating with selectors. + +Another helpful debugging utility is the function `expand_selector`. +Given a target frame or schema, you can check what columns a given selector will expand to: + +{{code_block('user-guide/expressions/expression-expansion', 'expand_selector', [], ['expand_selector'], [])}} + +```python exec="on" result="text" session="expressions/expression-expansion" +--8<-- "python/user-guide/expressions/expression-expansion.py:expand_selector" +``` + +### Complete reference + +The tables below group the functions available in the submodule `selectors` by their type of behaviour. 
+ +#### Selectors for data types + +Selectors that match based on the data type of the column: + +| Selector function | Data type(s) matched | +| ------------------ | ------------------------------------------------------------------ | +| `binary` | `Binary` | +| `boolean` | `Boolean` | +| `by_dtype` | Data types specified as arguments | +| `categorical` | `Categorical` | +| `date` | `Date` | +| `datetime` | `Datetime`, optionally filtering by time unit/zone | +| `decimal` | `Decimal` | +| `duration` | `Duration`, optionally filtering by time unit | +| `float` | All float types, regardless of precision | +| `integer` | All integer types, signed and unsigned, regardless of precision | +| `numeric` | All numeric types, namely integers, floats, and `Decimal` | +| `signed_integer` | All signed integer types, regardless of precision | +| `string` | `String` | +| `temporal` | All temporal data types, namely `Date`, `Datetime`, and `Duration` | +| `time` | `Time` | +| `unsigned_integer` | All unsigned integer types, regardless of precision | + +#### Selectors for column name patterns + +Selectors that match based on column name patterns: + +| Selector function | Columns selected | +| ----------------- | ------------------------------------------------------------ | +| `alpha` | Columns with alphabetical names | +| `alphanumeric` | Columns with alphanumeric names (letters and the digits 0-9) | +| `by_name` | Columns with the names specified as arguments | +| `contains` | Columns whose names contain the substring specified | +| `digit` | Columns with numeric names (only the digits 0-9) | +| `ends_with` | Columns whose names end with the given substring | +| `matches` | Columns whose names match the given regex pattern | +| `starts_with` | Columns whose names start with the given substring | + +#### Positional selectors + +Selectors that match based on the position of the columns: + +| Selector function | Columns selected | +| ----------------- | 
------------------------------------ | +| `all` | All columns | +| `by_index` | The columns at the specified indices | +| `first` | The first column in the context | +| `last` | The last column in the context | + +#### Miscellaneous functions + +The submodule `selectors` also provides the following functions: + +| Function | Behaviour | +| ----------------- | ------------------------------------------------------------------------------------- | +| `as_expr`* | Convert a selector to an expression | +| `exclude` | Select all columns except those matching the given names, data types, or selectors | +| `expand_selector` | Expand selector to matching columns with respect to a specific frame or target schema | +| `is_selector` | Check whether the given object/expression is a selector | + +*`as_expr` isn't a function defined on the submodule `selectors`, but rather a method defined on selectors. diff --git a/docs/source/user-guide/expressions/folds.md b/docs/source/user-guide/expressions/folds.md index 7990aae7eca8..6fb8d56072c4 100644 --- a/docs/source/user-guide/expressions/folds.md +++ b/docs/source/user-guide/expressions/folds.md @@ -1,26 +1,61 @@ # Folds -Polars provides expressions/methods for horizontal aggregations like `sum`,`min`, `mean`, -etc. However, when you need a more complex aggregation the default methods Polars supplies may not be sufficient. That's when `folds` come in handy. +Polars provides many expressions to perform computations across columns, like `sum_horizontal`, `mean_horizontal`, and `min_horizontal`. +However, these are just special cases of a general algorithm called a fold, and Polars provides a general mechanism for you to compute custom folds for when the specialised versions of Polars are not enough. -The `fold` expression operates on columns for maximum speed. It utilizes the data layout very efficiently and often has vectorized execution. +Folds computed with the function `fold` operate on the full columns for maximum speed.
+They utilize the data layout very efficiently and often have vectorized execution. -### Manual sum +## Basic example -Let's start with an example by implementing the `sum` operation ourselves, with a `fold`. +As a first example, we will reimplement `sum_horizontal` with the function `fold`: {{code_block('user-guide/expressions/folds','mansum',['fold'])}} ```python exec="on" result="text" session="user-guide/folds" ---8<-- "python/user-guide/expressions/folds.py:setup" --8<-- "python/user-guide/expressions/folds.py:mansum" ``` -The snippet above recursively applies the function `f(acc, x) -> acc` to an accumulator `acc` and a new column `x`. The function operates on columns individually and can take advantage of cache efficiency and vectorization. +The function `fold` expects a function `f` as the parameter `function` and `f` should accept two arguments. +The first argument is the accumulated result, which we initialise as zero, and the second argument takes the successive values of the expressions listed in the parameter `exprs`. +In our case, they're the two columns “a” and “b”. -### Conditional +The snippet below includes a third explicit expression that represents what the function `fold` is doing above: -In the case where you'd want to apply a condition/predicate on all columns in a `DataFrame` a `fold` operation can be a very concise way to express this. +{{code_block('user-guide/expressions/folds','mansum-explicit',['fold'])}} + +```python exec="on" result="text" session="user-guide/folds" +--8<-- "python/user-guide/expressions/folds.py:mansum-explicit" +``` + +??? tip "`fold` in Python" + + Most programming languages include a higher-order function that implements the algorithm that the function `fold` in Polars implements. + The Polars `fold` is very similar to Python's `functools.reduce`. + You can [learn more about the power of `functools.reduce` in this article](http://mathspp.com/blog/pydonts/the-power-of-reduce). 
+ +## The initial value `acc` + +The initial value chosen for the accumulator `acc` is typically, but not always, the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation you want to apply. +For example, if we wanted to multiply across the columns, we would not get the correct result if our accumulator was set to zero: + +{{code_block('user-guide/expressions/folds','manprod',['fold'])}} + +```python exec="on" result="text" session="user-guide/folds" +--8<-- "python/user-guide/expressions/folds.py:manprod" +``` + +To fix this, the accumulator `acc` should be set to `1`: + +{{code_block('user-guide/expressions/folds','manprod-fixed',['fold'])}} + +```python exec="on" result="text" session="user-guide/folds" +--8<-- "python/user-guide/expressions/folds.py:manprod-fixed" +``` + +## Conditional + +In the case where you'd want to apply a condition/predicate across all columns in a dataframe, a fold can be a very concise way to express this. {{code_block('user-guide/expressions/folds','conditional',['fold'])}} @@ -28,13 +63,14 @@ In the case where you'd want to apply a condition/predicate on all columns in a --8<-- "python/user-guide/expressions/folds.py:conditional" ``` -In the snippet we filter all rows where **each** column value is `> 1`. +The snippet above filters all rows where all columns are greater than 1. -### Folds and string data +## Folds and string data -Folds could be used to concatenate string data. However, due to the materialization of intermediate columns, this operation will have squared complexity. +Folds could be used to concatenate string data. +However, due to the materialization of intermediate columns, this operation will have squared complexity. -Therefore, we recommend using the `concat_str` expression for this. 
+Therefore, we recommend using the function `concat_str` for this: {{code_block('user-guide/expressions/folds','string',['concat_str'])}} diff --git a/docs/source/user-guide/expressions/functions.md b/docs/source/user-guide/expressions/functions.md deleted file mode 100644 index 21c17ea4758b..000000000000 --- a/docs/source/user-guide/expressions/functions.md +++ /dev/null @@ -1,65 +0,0 @@ -# Functions - -Polars expressions have a large number of built in functions. These allow you to create complex queries without the need for [user defined functions](user-defined-functions.md). There are too many to go through here, but we will cover some of the more popular use cases. If you want to view all the functions go to the API Reference for your programming language. - -In the examples below we will use the following `DataFrame`: - -{{code_block('user-guide/expressions/functions','dataframe',['DataFrame'])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:setup" ---8<-- "python/user-guide/expressions/functions.py:dataframe" -``` - -## Column naming - -By default if you perform an expression it will keep the same name as the original column. In the example below we perform an expression on the `nrs` column. Note that the output `DataFrame` still has the same name. - -{{code_block('user-guide/expressions/functions','samename',[])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:samename" -``` - -This might get problematic in the case you use the same column multiple times in your expression as the output columns will get duplicated. For example, the following query will fail. 
- -{{code_block('user-guide/expressions/functions','samenametwice',[])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:samenametwice" -``` - -You can change the output name of an expression by using the `alias` function - -{{code_block('user-guide/expressions/functions','samenamealias',['alias'])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:samenamealias" -``` - -In case of multiple columns for example when using `all()` or `col(*)` you can apply a mapping function `name.map` to change the original column name into something else. In case you want to add a suffix (`name.suffix()`) or prefix (`name.prefix()`) these are also built in. - -=== ":fontawesome-brands-python: Python" -[:material-api: `name.prefix`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.prefix.html) -[:material-api: `name.suffix`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.suffix.html) -[:material-api: `name.map`](https://docs.pola.rs/api/python/stable/reference/expressions/api/polars.Expr.name.map.html) - -## Count unique values - -There are two ways to count unique values in Polars: an exact methodology and an approximation. The approximation uses the [HyperLogLog++](https://en.wikipedia.org/wiki/HyperLogLog) algorithm to approximate the cardinality and is especially useful for very large datasets where an approximation is good enough. - -{{code_block('user-guide/expressions/functions','countunique',['n_unique','approx_n_unique'])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:countunique" -``` - -## Conditionals - -Polars supports if-else like conditions in expressions with the `when`, `then`, `otherwise` syntax. 
The predicate is placed in the `when` clause and when this evaluates to `true` the `then` expression is applied otherwise the `otherwise` expression is applied (row-wise). - -{{code_block('user-guide/expressions/functions','conditional',['when'])}} - -```python exec="on" result="text" session="user-guide/functions" ---8<-- "python/user-guide/expressions/functions.py:conditional" -``` diff --git a/docs/source/user-guide/expressions/index.md b/docs/source/user-guide/expressions/index.md index 32550974782e..9e753c45a6d4 100644 --- a/docs/source/user-guide/expressions/index.md +++ b/docs/source/user-guide/expressions/index.md @@ -1,18 +1,20 @@ # Expressions -In the `Contexts` sections we outlined what `Expressions` are and how they are invaluable. In this section we will focus on the `Expressions` themselves. Each section gives an overview of what they do and provide additional examples. +We [introduced the concept of “expressions” in a previous section](../concepts/expressions-and-contexts.md#expressions). +In this section we will focus on exploring the types of expressions that Polars offers. +Each section gives an overview of what they do and provides additional examples. 
-- [Operators](operators.md) -- [Column selections](column-selections.md) -- [Functions](functions.md) -- [Casting](casting.md) -- [Strings](strings.md) -- [Aggregation](aggregation.md) -- [Missing data](missing-data.md) -- [Window](window.md) -- [Folds](folds.md) -- [Lists](lists.md) -- [Plugins](plugins.md) -- [User-defined functions](user-defined-functions.md) -- [Structs](structs.md) -- [Numpy](numpy.md) +- [Basic operations](basic-operations.md) – how to do basic operations on dataframe columns, like arithmetic calculations, comparisons, and other common, general-purpose operations +- [Expression expansion](expression-expansion.md) – what is expression expansion and how to use it +- [Casting](casting.md) – how to convert / cast values to different data types +- [Strings](strings.md) – how to work with strings and the namespace `str` +- [Lists and arrays](lists-and-arrays.md) – the differences between the data types `List` and `Array`, when to use them, and how to use them +- [Categorical data and enums](categorical-data-and-enums.md) – the differences between the data types `Categorical` and `Enum`, when to use them, and how to use them +- [Structs](structs.md) – when to use the data type `Struct` and how to use it +- [Missing data](missing-data.md) – how to work with missing data and how to fill missing data +- [Aggregation](aggregation.md) – how to work with aggregating contexts like `group_by` +- [Window functions](window-functions.md) – how to apply window functions over columns in a dataframe +- [Folds](folds.md) – how to perform arbitrary computations horizontally across columns +- [Plugins for custom expressions](plugins-for-custom-expressions.md) – how to create your own custom expressions via Rust plugins +- [User-defined Python functions](user-defined-python-functions.md) – how to apply user-defined Python functions to dataframe columns or to column values +- [Numpy functions](numpy-functions.md) – how to use NumPy native functions on Polars 
dataframes and series diff --git a/docs/source/user-guide/expressions/lists-and-arrays.md b/docs/source/user-guide/expressions/lists-and-arrays.md new file mode 100644 index 000000000000..98e599c58bfa --- /dev/null +++ b/docs/source/user-guide/expressions/lists-and-arrays.md @@ -0,0 +1,182 @@ +# Lists and arrays + +Polars has first-class support for two homogeneous container data types: `List` and `Array`. +Polars supports many operations with the two data types and their APIs overlap, so this section of the user guide has the objective of clarifying when one data type should be chosen in favour of the other. + +## Lists vs arrays + +### The data type `List` + +The data type list is suitable for columns whose values are homogeneous 1D containers of varying lengths. + +The dataframe below contains three examples of columns with the data type `List`: + +{{code_block('user-guide/expressions/lists', 'list-example', ['List'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:list-example" +``` + +Note that the data type `List` is different from Python's type `list`, where elements can be of any type. +If you want to store true Python lists in a column, you can do so with the data type `Object` and your column will not have the list manipulation features that we're about to discuss. + +### The data type `Array` + +The data type `Array` is suitable for columns whose values are homogeneous containers of an arbitrary dimension with a known and fixed shape. + +The dataframe below contains two examples of columns with the data type `Array`. 
+ +{{code_block('user-guide/expressions/lists', 'array-example', ['Array'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:array-example" +``` + +The example above shows how to use the parameter `schema_overrides` to specify that the columns “bit_flags” and “tic_tac_toe” have the data type `Array`, parametrised by the data type of the elements contained within and by the shape of each array. + +In general, Polars does not infer that a column has the data type `Array` for performance reasons, and defaults to the appropriate variant of the data type `List`. +In Python, an exception to this rule is when you provide a NumPy array to build a column. +In that case, Polars has the guarantee from NumPy that all subarrays have the same shape, so an array of $n + 1$ dimensions will generate a column of $n$ dimensional arrays: + +{{code_block('user-guide/expressions/lists', 'numpy-array-inference', ['Array'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:numpy-array-inference" +``` + +### When to use each + +In short, prefer the data type `Array` over `List` because it is more memory efficient and more performant. +If you cannot use `Array`, then use `List`: + +- when the values within a column do not have a fixed shape; or +- when you need functions that are only available in the list API. + +## Working with lists + +### The namespace `list` + +Polars provides many functions to work with values of the data type `List` and these are grouped inside the namespace `list`. +We will explore this namespace a bit now. + +!!! warning "`arr` then, `list` now" +In previous versions of Polars, the namespace for list operations used to be `arr`. +`arr` is now the namespace for the data type `Array`. +If you find references to the namespace `arr` on StackOverflow or other sources, note that those sources _may_ be outdated. 
+ +The dataframe `weather` defined below contains data from different weather stations across a region. +When the weather station is unable to get a result, an error code is recorded instead of the actual temperature at that time. + +{{code_block('user-guide/expressions/lists', 'weather', [])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:weather" +``` + +### Programmatically creating lists + +Given the dataframe `weather` defined previously, it is very likely we need to run some analysis on the temperatures that are captured by each station. +To make this happen, we need to first be able to get individual temperature measurements. +We [can use the namespace `str`](strings.md#the-string-namespace) for this: + +{{code_block('user-guide/expressions/lists', 'split', ['split'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:split" +``` + +A natural follow-up would be to explode the list of temperatures so that each measurement is in its own row: + +{{code_block('user-guide/expressions/lists', 'explode', ['explode'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:explode" +``` + +However, in Polars we often do not need to do this to operate on the list elements. + +### Operating on lists + +Polars provides several standard operations on columns with the `List` data type. +[Similar to what you can do with strings](strings.md#slicing), lists can be sliced with the functions `head`, `tail`, and `slice`: + +{{code_block('user-guide/expressions/lists', 'list-slicing', ['Expr.list'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:list-slicing" +``` + +### Element-wise computation within lists + +If we need to identify the stations that are producing the most errors, we need to + +1.
try to convert the measurements into numbers; +2. count the number of non-numeric values (i.e., `null` values) in the list, by row; and +3. rename this output column as “errors” so that we can easily identify the stations. + +To perform these steps, we need to perform a casting operation on each measurement within the list values. +The function `eval` is used as the entry point to perform operations on the elements of the list. +Within it, you can use the context `element` to refer to each single element of the list individually, and then you can use any Polars expression on the element: + +{{code_block('user-guide/expressions/lists', 'element-wise-casting', ['element'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:element-wise-casting" +``` + +Another alternative would be to use a regular expression to check if a measurement starts with a letter: + +{{code_block('user-guide/expressions/lists', 'element-wise-regex', ['element'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:element-wise-regex" +``` + +If you are unfamiliar with the namespace `str` or the notation `(?i)` in the regex, now is a good time to [look at how to work with strings and regular expressions in Polars](strings.md#check-for-the-existence-of-a-pattern). + +### Row-wise computations + +The function `eval` gives us access to the list elements and `pl.element` refers to each individual element, but we can also use `pl.all()` to refer to all of the elements of the list. 
+ +To show this in action, we will start by creating another dataframe with some more weather data: + +{{code_block('user-guide/expressions/lists', 'weather_by_day', [])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:weather_by_day" +``` + +Now, we will calculate the percentage rank of the temperatures by day, measured across stations. +Polars does not provide a function to do this directly, but because expressions are so versatile we can create our own percentage rank expression for highest temperature. +Let's try that: + +{{code_block('user-guide/expressions/lists', 'rank_pct', ['element', 'rank'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:rank_pct" +``` + +## Working with arrays + +### Creating an array column + +As [we have seen above](#the-data-type-array), Polars usually does not infer the data type `Array` automatically. +You have to specify the data type `Array` when creating a series/dataframe or [cast a column](casting.md) explicitly unless you create the column out of a NumPy array. + +### The namespace `arr` + +The data type `Array` was recently introduced and is still pretty nascent in features that it offers. +Even so, the namespace `arr` aggregates several functions that you can use to work with arrays. + +!!! warning "`arr` then, `list` now" +In previous versions of Polars, the namespace for list operations used to be `arr`. +`arr` is now the namespace for the data type `Array`. +If you find references to the namespace `arr` on StackOverflow or other sources, note that those sources _may_ be outdated. 
+ +The API documentation should give you a good overview of the functions in the namespace `arr`, of which we present a couple: + +{{code_block('user-guide/expressions/lists', 'array-overview', ['Expr.arr'])}} + +```python exec="on" result="text" session="expressions/lists" +--8<-- "python/user-guide/expressions/lists.py:array-overview" +``` diff --git a/docs/source/user-guide/expressions/lists.md b/docs/source/user-guide/expressions/lists.md deleted file mode 100644 index dea95ffc2c1c..000000000000 --- a/docs/source/user-guide/expressions/lists.md +++ /dev/null @@ -1,119 +0,0 @@ -# Lists and Arrays - -Polars has first-class support for `List` columns: that is, columns where each row is a list of homogeneous elements, of varying lengths. Polars also has an `Array` datatype, which is analogous to NumPy's `ndarray` objects, where the length is identical across rows. - -Note: this is different from Python's `list` object, where the elements can be of any type. Polars can store these within columns, but as a generic `Object` datatype that doesn't have the special list manipulation features that we're about to discuss. - -## Powerful `List` manipulation - -Let's say we had the following data from different weather stations across a state. When the weather station is unable to get a result, an error code is recorded instead of the actual temperature at that time. - -{{code_block('user-guide/expressions/lists','weather_df',['DataFrame'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:setup" ---8<-- "python/user-guide/expressions/lists.py:weather_df" -``` - -### Creating a `List` column - -For the `weather` `DataFrame` created above, it's very likely we need to run some analysis on the temperatures that are captured by each station. To make this happen, we need to first be able to get individual temperature measurements. 
This is done by: - -{{code_block('user-guide/expressions/lists','string_to_list',['str.split'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:string_to_list" -``` - -One way we could go post this would be to convert each temperature measurement into its own row: - -{{code_block('user-guide/expressions/lists','explode_to_atomic',['DataFrame.explode'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:explode_to_atomic" -``` - -However, in Polars, we often do not need to do this to operate on the `List` elements. - -### Operating on `List` columns - -Polars provides several standard operations on `List` columns. If we want the first three measurements, we can do a `head(3)`. The last three can be obtained via a `tail(3)`, or alternately, via `slice` (negative indexing is supported). We can also identify the number of observations via `lengths`. Let's see them in action: - -{{code_block('user-guide/expressions/lists','list_ops',['Expr.list'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:list_ops" -``` - -!!! warning "`arr` then, `list` now" - - If you find references to the `arr` API on Stackoverflow or other sources, just replace `arr` with `list`, this was the old accessor for the `List` datatype. `arr` now refers to the newly introduced `Array` datatype (see below). - -### Element-wise computation within `List`s - -If we need to identify the stations that are giving the most number of errors from the starting `DataFrame`, we need to: - -1. Parse the string input as a `List` of string values (already done). -2. Identify those strings that can be converted to numbers. -3. Identify the number of non-numeric values (i.e. `null` values) in the list, by row. -4. Rename this output as `errors` so that we can easily identify the stations. 
- -The third step requires a casting (or alternately, a regex pattern search) operation to be perform on each element of the list. We can do this using by applying the operation on each element by first referencing them in the `pl.element()` context, and then calling a suitable Polars expression on them. Let's see how: - -{{code_block('user-guide/expressions/lists','count_errors',['Expr.list', 'element'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:count_errors" -``` - -What if we chose the regex route (i.e. recognizing the presence of _any_ alphabetical character?) - -{{code_block('user-guide/expressions/lists','count_errors_regex',['str.contains'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:count_errors_regex" -``` - -If you're unfamiliar with the `(?i)`, it's a good time to look at the documentation for the `str.contains` function in Polars! The Rust regex crate provides a lot of additional regex flags that might come in handy. - -## Row-wise computations - -This context is ideal for computing in row orientation. - -We can apply **any** Polars operations on the elements of the list with the `list.eval` (`list().eval` in Rust) expression! These expressions run entirely on Polars' query engine and can run in parallel, so will be well optimized. Let's say we have another set of weather data across three days, for different stations: - -{{code_block('user-guide/expressions/lists','weather_by_day',['DataFrame'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:weather_by_day" -``` - -Let's do something interesting, where we calculate the percentage rank of the temperatures by day, measured across stations. Pandas allows you to compute the percentages of the `rank` values. 
Polars doesn't provide a special function to do this directly, but because expressions are so versatile we can create our own percentage rank expression for highest temperature. Let's try that! - -{{code_block('user-guide/expressions/lists','weather_by_day_rank',['list.eval'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:weather_by_day_rank" -``` - -## Polars `Array`s - -`Array`s are a new data type that was recently introduced, and are still pretty nascent in features that it offers. The major difference between a `List` and an `Array` is that the latter is limited to having the same number of elements per row, while a `List` can have a variable number of elements. Both still require that each element's data type is the same. - -We can define `Array` columns in this manner: - -{{code_block('user-guide/expressions/lists','array_df',['Array'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:array_df" -``` - -Basic operations are available on it: - -{{code_block('user-guide/expressions/lists','array_ops',['Series.arr'])}} - -```python exec="on" result="text" session="user-guide/lists" ---8<-- "python/user-guide/expressions/lists.py:array_ops" -``` - -Polars `Array`s are still being actively developed, so this section will likely change in the future. diff --git a/docs/source/user-guide/expressions/missing-data.md b/docs/source/user-guide/expressions/missing-data.md index ce2fd0216c5f..f1697cced489 100644 --- a/docs/source/user-guide/expressions/missing-data.md +++ b/docs/source/user-guide/expressions/missing-data.md @@ -1,31 +1,35 @@ # Missing data -This page sets out how missing data is represented in Polars and how missing data can be filled. +This section of the user guide teaches how to work with missing data in Polars. 
## `null` and `NaN` values -Each column in a `DataFrame` (or equivalently a `Series`) is an Arrow array or a collection of Arrow arrays [based on the Apache Arrow spec](https://arrow.apache.org/docs/format/Columnar.html#null-count). Missing data is represented in Arrow and Polars with a `null` value. This `null` missing value applies for all data types including numerical values. +In Polars, missing data is represented by the value `null`. +This missing value `null` is used for all data types, including numerical types. -Polars also allows `NotaNumber` or `NaN` values for float columns. These `NaN` values are considered to be a type of floating point data rather than missing data. We discuss `NaN` values separately below. +Polars also supports the value `NaN` (“Not a Number”) for columns with floating point numbers. +The value `NaN` is considered to be a valid floating point value, which is different from missing data. +[We discuss the value `NaN` separately below](#not-a-number-or-nan-values). -You can manually define a missing value with the python `None` value: +When creating a series or a dataframe, you can set a value to `null` by using the appropriate construct for your language: {{code_block('user-guide/expressions/missing-data','dataframe',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/missing-data" ---8<-- "python/user-guide/expressions/missing-data.py:setup" --8<-- "python/user-guide/expressions/missing-data.py:dataframe" ``` -!!! info +!!! info "Difference from pandas" - In pandas the value for missing data depends on the dtype of the column. In Polars missing data is always represented as a `null` value. + In pandas, the value used to represent missing data depends on the data type of the column. + In Polars, missing data is always represented by the value `null`. ## Missing data metadata -Each Arrow array used by Polars stores two kinds of metadata related to missing data. 
This metadata allows Polars to quickly show how many missing values there are and which values are missing. +Polars keeps track of some metadata regarding the missing data of each series. +This metadata allows Polars to answer some basic queries about missing values in a very efficient way, namely how many values are missing and which ones are missing. -The first piece of metadata is the `null_count` - this is the number of rows with `null` values in the column: +To determine how many values are missing from a column you can use the function `null_count`: {{code_block('user-guide/expressions/missing-data','count',['null_count'])}} @@ -33,12 +37,13 @@ The first piece of metadata is the `null_count` - this is the number of rows wit --8<-- "python/user-guide/expressions/missing-data.py:count" ``` -The `null_count` method can be called on a `DataFrame`, a column from a `DataFrame` or a `Series`. The `null_count` method is a cheap operation as `null_count` is already calculated for the underlying Arrow array. - -The second piece of metadata is an array called a _validity bitmap_ that indicates whether each data value is valid or missing. -The validity bitmap is memory efficient as it is bit encoded - each value is either a 0 or a 1. This bit encoding means the memory overhead per array is only (array length / 8) bytes. The validity bitmap is used by the `is_null` method in Polars. +The function `null_count` can be called on a dataframe, a column from a dataframe, or on a series directly. +The function `null_count` is a cheap operation because the result is already known. -You can return a `Series` based on the validity bitmap for a column in a `DataFrame` or a `Series` with the `is_null` method: +Polars uses something called a “validity bitmap” to know which values are missing in a series. +The validity bitmap is memory efficient as it is bit encoded. +If a series has length $n$, then its validity bitmap will cost $n / 8$ bytes. 
+The function `is_null` uses the validity bitmap to efficiently report which values are `null` and which are not: {{code_block('user-guide/expressions/missing-data','isnull',['is_null'])}} @@ -46,18 +51,27 @@ You can return a `Series` based on the validity bitmap for a column in a `DataFr --8<-- "python/user-guide/expressions/missing-data.py:isnull" ``` -The `is_null` method is a cheap operation that does not require scanning the full column for `null` values. This is because the validity bitmap already exists and can be returned as a Boolean array. +The function `is_null` can be used on a column of a dataframe or on a series directly. +Again, this is a cheap operation because the result is already known by Polars. + +??? info "Why does Polars waste memory on a validity bitmap?" + + It all comes down to a tradeoff. + By using a bit more memory per column, Polars can be much more efficient when performing most operations on your columns. + If the validity bitmap wasn't known, every time you wanted to compute something you would have to check each position of the series to see if a legal value was present or not. + With the validity bitmap, Polars knows automatically the positions where your operations can be applied. ## Filling missing data -Missing data in a `Series` can be filled with the `fill_null` method. You have to specify how you want the `fill_null` method to fill the missing data. The main ways to do this are filling with: +Missing data in a series can be filled with the function `fill_null`. +You can specify how missing data is effectively filled in a couple of different ways: -- a literal such as 0 or "0" -- a strategy such as filling forwards -- an expression such as replacing with values from another column -- interpolation +- a literal of the correct data type; +- a Polars expression, such as replacing with values computed from another column; +- a strategy based on neighbouring values, such as filling forwards or backwards; and +- interpolation. 
-We illustrate each way to fill nulls by defining a simple `DataFrame` with a missing value in `col2`: +To illustrate how each of these methods works we start by defining a simple dataframe with two missing values in the second column: {{code_block('user-guide/expressions/missing-data','dataframe2',['DataFrame'])}} ```python exec="on" result="text" session="user-guide/missing-data" ---8<-- "python/user-guide/expressions/missing-data.py:dataframe2" ``` -### Fill with specified literal value +### Fill with a specified literal value -We can fill the missing data with a specified literal value with `pl.lit`: +You can fill the missing data with a specified literal value. +This literal value will replace all of the occurrences of the value `null`: {{code_block('user-guide/expressions/missing-data','fill',['fill_null'])}} @@ -75,34 +90,35 @@ We can fill the missing data with a specified literal value with `pl.lit`: --8<-- "python/user-guide/expressions/missing-data.py:fill" ``` -### Fill with a strategy +However, this is actually just a special case of the general case where [the function `fill_null` replaces missing values with the corresponding values from the result of a Polars expression](#fill-with-an-expression), as seen next. -We can fill the missing data with a strategy such as filling forward: +### Fill with an expression -{{code_block('user-guide/expressions/missing-data','fillstrategy',['fill_null'])}} +In the general case, the missing data can be filled by extracting the corresponding values from the result of a general Polars expression. 
+For example, we can fill the second column with values taken from the double of the first column: + +{{code_block('user-guide/expressions/missing-data','fillexpr',['fill_null'])}} ```python exec="on" result="text" session="user-guide/missing-data" ---8<-- "python/user-guide/expressions/missing-data.py:fillstrategy" +--8<-- "python/user-guide/expressions/missing-data.py:fillexpr" ``` -You can find other fill strategies in the API docs. +### Fill with a strategy based on neighbouring values -### Fill with an expression +You can also fill the missing data by following a fill strategy based on the neighbouring values. +The two simpler strategies look for the first non-`null` value that comes immediately before or immediately after the value `null` that is being filled: -For more flexibility we can fill the missing data with an expression. For example, -to fill nulls with the median value from that column: - -{{code_block('user-guide/expressions/missing-data','fillexpr',['fill_null'])}} +{{code_block('user-guide/expressions/missing-data','fillstrategy',['fill_null'])}} ```python exec="on" result="text" session="user-guide/missing-data" ---8<-- "python/user-guide/expressions/missing-data.py:fillexpr" +--8<-- "python/user-guide/expressions/missing-data.py:fillstrategy" ``` -In this case the column is cast from integer to float because the median is a float statistic. +You can find other fill strategies in the API docs. 
### Fill with interpolation -In addition, we can fill nulls with interpolation (without using the `fill_null` function): +Additionally, you can fill missing data with interpolation by using the function `interpolate` instead of the function `fill_null`: {{code_block('user-guide/expressions/missing-data','fillinterpolate',['interpolate'])}} @@ -110,9 +126,11 @@ In addition, we can fill nulls with interpolation (without using the `fill_null` --8<-- "python/user-guide/expressions/missing-data.py:fillinterpolate" ``` -## `NotaNumber` or `NaN` values +## Not a Number, or `NaN` values -Missing data in a `Series` has a `null` value. However, you can use `NotaNumber` or `NaN` values in columns with float datatypes. These `NaN` values can be created from Numpy's `np.nan` or the native python `float('nan')`: +Missing data in a series is represented by the value `null`, regardless of the data type of the series. +However, in columns that have a floating point data type, the value `NaN` can be used. +These values can be created directly: {{code_block('user-guide/expressions/missing-data','nan',['DataFrame'])}} @@ -120,18 +138,30 @@ Missing data in a `Series` has a `null` value. However, you can use `NotaNumber` --8<-- "python/user-guide/expressions/missing-data.py:nan" ``` +The special value `NaN` might also arise as the result of a computation: + +{{code_block('user-guide/expressions/missing-data','nan-computed',[])}} + +```python exec="on" result="text" session="user-guide/missing-data" +--8<-- "python/user-guide/expressions/missing-data.py:nan-computed" +``` + !!! info - In pandas by default a `NaN` value in an integer column causes the column to be cast to float. This does not happen in Polars - instead an exception is raised. + By default, a `NaN` value in an integer column causes the column to be cast to a float data type in pandas. + This does not happen in Polars; instead, an exception is raised. 
-`NaN` values are considered to be a type of floating point data and are **not considered to be missing data** in Polars. This means: +`NaN` values are considered to be a type of floating point data and are **not considered to be missing data** in Polars. +This means: -- `NaN` values are **not** counted with the `null_count` method -- `NaN` values are filled when you use `fill_nan` method but are **not** filled with the `fill_null` method +- `NaN` values are **not** counted with the function `null_count`; and +- `NaN` values are filled when you use the specialised function `fill_nan` but are **not** filled with the function `fill_null`. -Polars has `is_nan` and `fill_nan` methods which work in a similar way to the `is_null` and `fill_null` methods. The underlying Arrow arrays do not have a pre-computed validity bitmask for `NaN` values so this has to be computed for the `is_nan` method. +Polars has the functions `is_nan` and `fill_nan`, which work in a similar way to the functions `is_null` and `fill_null`. +Unlike with missing data, Polars does not hold any metadata regarding the `NaN` values, so the function `is_nan` entails actual computation. -One further difference between `null` and `NaN` values is that taking the `mean` of a column with `null` values excludes the `null` values from the calculation but with `NaN` values taking the mean results in a `NaN`. This behaviour can be avoided by replacing the `NaN` values with `null` values; +One further difference between the values `null` and `NaN` is that numerical aggregating functions, like `mean` and `sum`, skip the missing values when computing the result, whereas the value `NaN` is considered for the computation and typically propagates into the result. 
+If desirable, this behavior can be avoided by replacing the occurrences of the value `NaN` with the value `null`: {{code_block('user-guide/expressions/missing-data','nanfill',['fill_nan'])}} diff --git a/docs/source/user-guide/expressions/numpy-functions.md b/docs/source/user-guide/expressions/numpy-functions.md new file mode 100644 index 000000000000..b140d5ff458e --- /dev/null +++ b/docs/source/user-guide/expressions/numpy-functions.md @@ -0,0 +1,24 @@ +# Numpy functions + +Polars expressions support NumPy [ufuncs](https://numpy.org/doc/stable/reference/ufuncs.html). +See [the NumPy documentation for a list of all supported NumPy functions](https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs). + +This means that if a function is not provided by Polars, we can use NumPy and we still have fast columnar operations through the NumPy API. + +## Example + +{{code_block('user-guide/expressions/numpy-example',api_functions=['DataFrame','np.log'])}} + +```python exec="on" result="text" session="user-guide/numpy" +--8<-- "python/user-guide/expressions/numpy-example.py" +``` + +## Interoperability + +Polars' series have support for NumPy universal functions (ufuncs) and generalized ufuncs. +Element-wise functions such as `np.exp`, `np.cos`, `np.divide`, etc, all work with almost zero overhead. + +However, bear in mind that [Polars keeps track of missing values with a separate bitmask](missing-data.md) and NumPy does not receive this information. +This can lead to a window function or a `np.convolve` giving flawed or incomplete results, so an error will be raised if you pass a series with missing data to a generalized ufunc. +Convert a Polars series to a NumPy array with the function `to_numpy`. +Missing values will be replaced by `np.nan` during the conversion. 
diff --git a/docs/source/user-guide/expressions/numpy.md b/docs/source/user-guide/expressions/numpy.md deleted file mode 100644 index 4a5a46978b57..000000000000 --- a/docs/source/user-guide/expressions/numpy.md +++ /dev/null @@ -1,22 +0,0 @@ -# Numpy - -Polars expressions support NumPy [ufuncs](https://numpy.org/doc/stable/reference/ufuncs.html). See [here](https://numpy.org/doc/stable/reference/ufuncs.html#available-ufuncs) -for a list on all supported numpy functions. - -This means that if a function is not provided by Polars, we can use NumPy and we still have fast columnar operation through the NumPy API. - -### Example - -{{code_block('user-guide/expressions/numpy-example',api_functions=['DataFrame','np.log'])}} - -```python exec="on" result="text" session="user-guide/numpy" ---8<-- "python/user-guide/expressions/numpy-example.py" -``` - -### Interoperability - -Polars `Series` have support for NumPy universal functions (ufuncs) and generalized ufuncs. Element-wise functions such as `np.exp()`, `np.cos()`, `np.div()`, etc. all work with almost zero overhead. - -However, as a Polars-specific remark: missing values are a separate bitmask and are not visible by NumPy. This can lead to a window function or a `np.convolve()` giving flawed or incomplete results, so an error will be raised if you pass a `Series` with missing data to a generalized ufunc. - -Convert a Polars `Series` to a NumPy array with the `.to_numpy()` method. Missing values will be replaced by `np.nan` during the conversion. diff --git a/docs/source/user-guide/expressions/operators.md b/docs/source/user-guide/expressions/operators.md deleted file mode 100644 index 24cb4e6834b8..000000000000 --- a/docs/source/user-guide/expressions/operators.md +++ /dev/null @@ -1,30 +0,0 @@ -# Basic operators - -This section describes how to use basic operators (e.g. addition, subtraction) in conjunction with Expressions. 
We will provide various examples using different themes in the context of the following dataframe. - -!!! note Operator Overloading - - In Rust and Python it is possible to use the operators directly (as in `+ - * / < > `) as the language allows operator overloading. For instance, the operator `+` translates to the `.add()` method. You can choose the one you prefer. - -{{code_block('user-guide/expressions/operators','dataframe',['DataFrame'])}} - -```python exec="on" result="text" session="user-guide/operators" ---8<-- "python/user-guide/expressions/operators.py:setup" ---8<-- "python/user-guide/expressions/operators.py:dataframe" -``` - -### Numerical - -{{code_block('user-guide/expressions/operators','numerical',['operators'])}} - -```python exec="on" result="text" session="user-guide/operators" ---8<-- "python/user-guide/expressions/operators.py:numerical" -``` - -### Logical - -{{code_block('user-guide/expressions/operators','logical',['operators'])}} - -```python exec="on" result="text" session="user-guide/operators" ---8<-- "python/user-guide/expressions/operators.py:logical" -``` diff --git a/docs/source/user-guide/expressions/plugins.md b/docs/source/user-guide/expressions/plugins-for-custom-expressions.md similarity index 98% rename from docs/source/user-guide/expressions/plugins.md rename to docs/source/user-guide/expressions/plugins-for-custom-expressions.md index 9ef5633cfcd0..8da90f5f5446 100644 --- a/docs/source/user-guide/expressions/plugins.md +++ b/docs/source/user-guide/expressions/plugins-for-custom-expressions.md @@ -1,4 +1,6 @@ -# Expression plugins +# Plugins for custom expressions + + Expression plugins are the preferred way to create user defined functions. They allow you to compile a Rust function and register that as an expression into the Polars library. 
The Polars engine will dynamically link your function at runtime diff --git a/docs/source/user-guide/expressions/speed_rank_by_type.svg b/docs/source/user-guide/expressions/speed_rank_by_type.svg new file mode 100644 index 000000000000..aa1eeaa9736a --- /dev/null +++ b/docs/source/user-guide/expressions/speed_rank_by_type.svg @@ -0,0 +1,102 @@ + + + + + + Bulbasaur + + Ivysaur + + Venusaur + + + VenusaurMega + Venusaur + + + Charmander + + ... + + Oddish + + Gloom + + ... + + Grass + + Grass + + Grass + + Grass + + Fire + + ... + + Grass + + Grass + + ... + + 45 + + 60 + + 80 + + 80 + + 65 + + ... + + 30 + + 40 + + ... + + + + + + 6 + + 3 + + 1 + + 1 + + 7 + + ... + + 8 + + 7 + + ... + + Name + + Type 1 + + Speed + + Speed rank + + Goldbat + + Poison + + 90 + + 1 + diff --git a/docs/source/user-guide/expressions/strings.md b/docs/source/user-guide/expressions/strings.md index b7aefcf0ba75..ab8bea96109a 100644 --- a/docs/source/user-guide/expressions/strings.md +++ b/docs/source/user-guide/expressions/strings.md @@ -1,62 +1,134 @@ # Strings -The following section discusses operations performed on `String` data, which is a frequently used `DataType` when working with `DataFrames`. However, processing strings can often be inefficient due to their unpredictable memory size, causing the CPU to access many random memory locations. To address this issue, Polars utilizes Arrow as its backend, which stores all strings in a contiguous block of memory. As a result, string traversal is cache-optimal and predictable for the CPU. +The following section discusses operations performed on string data, which is a frequently used data type when working with dataframes. +String processing functions are available in the namespace `str`. -String processing functions are available in the `str` namespace. +Working with strings in other dataframe libraries can be highly inefficient due to the fact that strings have unpredictable lengths. 
+Polars mitigates these inefficiencies by [following the Arrow Columnar Format specification](../concepts/data-types-and-structures.md#data-types-internals), so you can write performant data queries on string data too. -##### Accessing the string namespace +## The string namespace -The `str` namespace can be accessed through the `.str` attribute of a column with `String` data type. In the following example, we create a column named `animal` and compute the length of each element in the column in terms of the number of bytes and the number of characters. If you are working with ASCII text, then the results of these two computations will be the same, and using `len_bytes` is recommended since it is faster. +When working with string data you will likely need to access the namespace `str`, which aggregates 40+ functions that let you work with strings. +As an example of how to access functions from within that namespace, the snippet below shows how to compute the length of the strings in a column in terms of the number of bytes and the number of characters: {{code_block('user-guide/expressions/strings','df',['str.len_bytes','str.len_chars'])}} -```python exec="on" result="text" session="user-guide/strings" ---8<-- "python/user-guide/expressions/strings.py:setup" +```python exec="on" result="text" session="expressions/strings" --8<-- "python/user-guide/expressions/strings.py:df" ``` -#### String parsing +!!! note +If you are working exclusively with ASCII text, then the results of the two computations will be the same and using `len_bytes` is recommended since it is faster. -Polars offers multiple methods for checking and parsing elements of a string. Firstly, we can use the `contains` method to check whether a given pattern exists within a substring. Subsequently, we can extract these patterns and replace them using other methods, which will be demonstrated in upcoming examples. 
+## Parsing strings -##### Check for existence of a pattern +Polars offers multiple methods for checking and parsing elements of a string column, namely checking for the existence of given substrings or patterns, and counting, extracting, or replacing, them. +We will demonstrate some of these operations in the upcoming examples. -To check for the presence of a pattern within a string, we can use the contains method. The `contains` method accepts either a regular substring or a regex pattern, depending on the value of the `literal` parameter. If the pattern we're searching for is a simple substring located either at the beginning or end of the string, we can alternatively use the `starts_with` and `ends_with` functions. +### Check for the existence of a pattern + +We can use the function `contains` to check for the presence of a pattern within a string. +By default, the argument to the function `contains` is interpreted as a regular expression. +If you want to specify a literal substring, set the parameter `literal` to `True`. + +For the special cases where you want to check if the strings start or end with a fixed substring, you can use the functions `starts_with` or `ends_with`, respectively. {{code_block('user-guide/expressions/strings','existence',['str.contains', 'str.starts_with','str.ends_with'])}} -```python exec="on" result="text" session="user-guide/strings" +```python exec="on" result="text" session="expressions/strings" --8<-- "python/user-guide/expressions/strings.py:existence" ``` -##### Extract a pattern +### Regex specification + +Polars relies on the Rust crate `regex` to work with regular expressions, so you may need to [refer to the syntax documentation](https://docs.rs/regex/latest/regex/#syntax) to see what features and flags are supported. +In particular, note that the flavor of regex supported by Polars is different from Python's module `re`. + +### Extract a pattern -The `extract` method allows us to extract a pattern from a specified string. 
This method takes a regex pattern containing one or more capture groups, which are defined by parentheses `()` in the pattern. The group index indicates which capture group to output. +The function `extract` allows us to extract patterns from the string values in a column. +The function `extract` accepts a regex pattern with one or more capture groups and extracts the capture group specified as the second argument. {{code_block('user-guide/expressions/strings','extract',['str.extract'])}} -```python exec="on" result="text" session="user-guide/strings" +```python exec="on" result="text" session="expressions/strings" --8<-- "python/user-guide/expressions/strings.py:extract" ``` -To extract all occurrences of a pattern within a string, we can use the `extract_all` method. In the example below, we extract all numbers from a string using the regex pattern `(\d+)`, which matches one or more digits. The resulting output of the `extract_all` method is a list containing all instances of the matched pattern within the string. +To extract all occurrences of a pattern within a string, we can use the function `extract_all`. +In the example below, we extract all numbers from a string using the regex pattern `(\d+)`, which matches one or more digits. +The resulting output of the function `extract_all` is a list containing all instances of the matched pattern within the string. {{code_block('user-guide/expressions/strings','extract_all',['str.extract_all'])}} -```python exec="on" result="text" session="user-guide/strings" +```python exec="on" result="text" session="expressions/strings" --8<-- "python/user-guide/expressions/strings.py:extract_all" ``` -##### Replace a pattern +### Replace a pattern -We have discussed two methods for pattern matching and extraction thus far, and now we will explore how to replace a pattern within a string. Similar to `extract` and `extract_all`, Polars provides the `replace` and `replace_all` methods for this purpose. 
In the example below we replace one match of `abc` at the end of a word (`\b`) by `ABC` and we replace all occurrence of `a` with `-`. +Akin to the functions `extract` and `extract_all`, Polars provides the functions `replace` and `replace_all`. +These accept a regex pattern or a literal substring (if the parameter `literal` is set to `True`) and perform the replacements specified. +The function `replace` will make at most one replacement whereas the function `replace_all` will make all the non-overlapping replacements it finds. -{{code_block('user-guide/expressions/strings','replace',['str.replace','str.replace_all'])}} +{{code_block('user-guide/expressions/strings','replace',['str.replace', 'str.replace_all'])}} -```python exec="on" result="text" session="user-guide/strings" +```python exec="on" result="text" session="expressions/strings" --8<-- "python/user-guide/expressions/strings.py:replace" ``` -#### API documentation +## Modifying strings + +### Case conversion + +Converting the casing of a string is a common operation and Polars supports it out of the box with the functions `to_lowercase`, `to_titlecase`, and `to_uppercase`: + +{{code_block('user-guide/expressions/strings','casing', ['str.to_lowercase', 'str.to_titlecase', 'str.to_uppercase'])}} + +```python exec="on" result="text" session="expressions/strings" +--8<-- "python/user-guide/expressions/strings.py:casing" +``` + +### Stripping characters from the ends + +Polars provides five functions in the namespace `str` that let you strip characters from the ends of the string: + +| Function | Behaviour | +| ------------------- | --------------------------------------------------------------------- | +| `strip_chars` | Removes leading and trailing occurrences of the characters specified. | +| `strip_chars_end` | Removes trailing occurrences of the characters specified. | +| `strip_chars_start` | Removes leading occurrences of the characters specified. 
| +| `strip_prefix` | Removes an exact substring prefix if present. | +| `strip_suffix` | Removes an exact substring suffix if present. | + +??? info "Similarity to Python string methods" +`strip_chars` is similar to Python's string method `strip` and `strip_prefix`/`strip_suffix` are similar to Python's string methods `removeprefix` and `removesuffix`, respectively. + +It is important to understand that the first three functions interpret their string argument as a set of characters whereas the functions `strip_prefix` and `strip_suffix` do interpret their string argument as a literal string. + +{{code_block('user-guide/expressions/strings', 'strip', ['str.strip_chars', 'str.strip_chars_end', 'str.strip_chars_start', 'str.strip_prefix', 'str.strip_suffix'])}} + +```python exec="on" result="text" session="expressions/strings" +--8<-- "python/user-guide/expressions/strings.py:strip" +``` + +If no argument is provided, the three functions `strip_chars`, `strip_chars_end`, and `strip_chars_start`, remove whitespace by default. + +### Slicing + +Besides [extracting substrings as specified by patterns](#extract-a-pattern), you can also slice strings at specified offsets to produce substrings. +The general-purpose function for slicing is `slice` and it takes the starting offset and the optional _length_ of the slice. +If the length of the slice is not specified or if it's past the end of the string, Polars slices the string all the way to the end. + +The functions `head` and `tail` are specialised versions used for slicing the beginning and end of a string, respectively. 
+ +{{code_block('user-guide/expressions/strings', 'slice', ['str.slice', 'str.head', 'str.tail'])}} + +```python exec="on" result="text" session="expressions/strings" +--8<-- "python/user-guide/expressions/strings.py:slice" +``` + +## API documentation -In addition to the examples covered above, Polars offers various other string manipulation methods for tasks such as formatting, stripping, splitting, and more. To explore these additional methods, you can go to the API documentation of your chosen programming language for Polars. +In addition to the examples covered above, Polars offers various other string manipulation functions. +To explore these additional methods, you can go to the API documentation of your chosen programming language for Polars. diff --git a/docs/source/user-guide/expressions/structs.md b/docs/source/user-guide/expressions/structs.md index d692c05ad0a1..2fb9e5ba20f0 100644 --- a/docs/source/user-guide/expressions/structs.md +++ b/docs/source/user-guide/expressions/structs.md @@ -1,82 +1,112 @@ -# The Struct datatype +# Structs -Polars `Struct`s are the idiomatic way of working with multiple columns. It is also a free operation i.e. moving columns into `Struct`s does not copy any data! +The data type `Struct` is a composite data type that can store multiple fields in a single column. -For this section, let's start with a `DataFrame` that captures the average rating of a few movies across some states in the U.S.: +??? tip "Python analogy" +For Python users, the data type `Struct` is kind of like a Python dictionary. +Even better, if you are familiar with Python typing, you can think of the data type `Struct` as `typing.TypedDict`. + +In this page of the user guide we will see situations in which the data type `Struct` arises, we will understand why it does arise, and we will see how to work with `Struct` values. 
+ +Let's start with a dataframe that captures the average rating of a few movies across some states in the US: {{code_block('user-guide/expressions/structs','ratings_df',['DataFrame'])}} -```python exec="on" result="text" session="user-guide/structs" ---8<-- "python/user-guide/expressions/structs.py:setup" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:ratings_df" ``` -## Encountering the `Struct` type +## Encountering the data type `Struct` -A common operation that will lead to a `Struct` column is the ever so popular `value_counts` function that is commonly used in exploratory data analysis. Checking the number of times a state appears the data will be done as so: +A common operation that will lead to a `Struct` column is the ever so popular `value_counts` function that is commonly used in exploratory data analysis. +Checking the number of times a state appears in the data is done as so: {{code_block('user-guide/expressions/structs','state_value_counts',['value_counts'])}} -```python exec="on" result="text" session="user-guide/structs" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:state_value_counts" ``` -Quite unexpected an output, especially if coming from tools that do not have such a data type. We're not in peril though, to get back to a more familiar output, all we need to do is `unnest` the `Struct` column into its constituent columns: +Quite unexpected an output, especially if coming from tools that do not have such a data type. +We're not in peril, though. 
+To get back to a more familiar output, all we need to do is use the function `unnest` on the `Struct` column: {{code_block('user-guide/expressions/structs','struct_unnest',['unnest'])}} -```python exec="on" result="text" session="user-guide/structs" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:struct_unnest" ``` +The function `unnest` will turn each field of the `Struct` into its own column. + !!! note "Why `value_counts` returns a `Struct`" - Polars expressions always have a `Fn(Series) -> Series` signature and `Struct` is thus the data type that allows us to provide multiple columns as input/output of an expression. In other words, all expressions have to return a `Series` object, and `Struct` allows us to stay consistent with that requirement. + Polars expressions always operate on a single series and return another series. + `Struct` is the data type that allows us to provide multiple columns as input to an expression, or to output multiple columns from an expression. + Thus, we can use the data type `Struct` to specify each value and its count when we use `value_counts`. -## Structs as `dict`s +## Inferring the data type `Struct` from dictionaries -Polars will interpret a `dict` sent to the `Series` constructor as a `Struct`: +When building series or dataframes, Polars will convert dictionaries to the data type `Struct`: {{code_block('user-guide/expressions/structs','series_struct',['Series'])}} -```python exec="on" result="text" session="user-guide/structs" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:series_struct" ``` -!!! note "Constructing `Series` objects" +The number of fields, their names, and their types, are inferred from the first dictionary seen. 
+Subsequent incongruences can result in `null` values or in errors: - Note that `Series` here was constructed with the `name` of the series in the beginning, followed by the `values`. Providing the latter first - is considered an anti-pattern in Polars, and must be avoided. +{{code_block('user-guide/expressions/structs','series_struct_error',['Series'])}} + +```python exec="on" result="text" session="expressions/structs" +--8<-- "python/user-guide/expressions/structs.py:series_struct_error" +``` -### Extracting individual values of a `Struct` +## Extracting individual values of a `Struct` -Let's say that we needed to obtain just the `movie` value in the `Series` that we created above. We can use the `field` method to do so: +Let's say that we needed to obtain just the field `"Movie"` from the `Struct` in the series that we created above. +We can use the function `field` to do so: {{code_block('user-guide/expressions/structs','series_struct_extract',['struct.field'])}} -```python exec="on" result="text" session="user-guide/structs" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:series_struct_extract" ``` -### Renaming individual keys of a `Struct` +## Renaming individual fields of a `Struct` -What if we need to rename individual `field`s of a `Struct` column? We first convert the `rating_series` object to a `DataFrame` so that we can view the changes easily, and then use the `rename_fields` method: +What if we need to rename individual fields of a `Struct` column? 
+We use the function `rename_fields`: {{code_block('user-guide/expressions/structs','series_struct_rename',['struct.rename_fields'])}} -```python exec="on" result="text" session="user-guide/structs" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:series_struct_rename" ``` +To be able to actually see that the field names were changed, we will create a dataframe where the only column is the result and then we use the function `unnest` so that each field becomes its own column. +The column names will reflect the renaming operation we just did: + +{{code_block('user-guide/expressions/structs','struct-rename-check',['struct.rename_fields'])}} + +```python exec="on" result="text" session="expressions/structs" +--8<-- "python/user-guide/expressions/structs.py:struct-rename-check" +``` + ## Practical use-cases of `Struct` columns ### Identifying duplicate rows -Let's get back to the `ratings` data. We want to identify cases where there are duplicates at a `Movie` and `Theatre` level. This is where the `Struct` datatype shines: +Let's get back to the `ratings` data. +We want to identify cases where there are duplicates at a “Movie” and “Theatre” level. + +This is where the data type `Struct` shines: {{code_block('user-guide/expressions/structs','struct_duplicates',['is_duplicated', 'struct'])}} -```python exec="on" result="text" session="user-guide/structs" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:struct_duplicates" ``` @@ -84,23 +114,40 @@ We can identify the unique cases at this level also with `is_unique`! ### Multi-column ranking -Suppose, given that we know there are duplicates, we want to choose which rank gets a higher priority. We define `Count` of ratings to be more important than the actual `Avg_Rating` themselves, and only use it to break a tie. 
We can then do: +Suppose, given that we know there are duplicates, we want to choose which rating gets a higher priority. +We can say that the column “Count” is the most important, and if there is a tie in the column “Count” then we consider the column “Avg_Rating”. + +We can then do: {{code_block('user-guide/expressions/structs','struct_ranking',['is_duplicated', 'struct'])}} -```python exec="on" result="text" session="user-guide/structs" +```python exec="on" result="text" session="expressions/structs" --8<-- "python/user-guide/expressions/structs.py:struct_ranking" ``` That's a pretty complex set of requirements done very elegantly in Polars! +To learn more about the function `over`, used above, [see the user guide section on window functions](window-functions.md). + +### Using multiple columns in a single expression + +As mentioned earlier, the data type `Struct` is also useful if you need to pass multiple columns as input to an expression. +As an example, suppose we want to compute [the Ackermann function](https://en.wikipedia.org/wiki/Ackermann_function) on two columns of a dataframe. +There is no way of composing Polars expressions to compute the Ackermann function[^1], so we define a custom function: -### Using multi-column apply +{{code_block('user-guide/expressions/structs', 'ack', [])}} -This was discussed in the previous section on _User Defined Functions_ for the Python case. 
-Here's an example of doing so with both Python and Rust: +```python exec="on" result="text" session="expressions/structs" +--8<-- "python/user-guide/expressions/structs.py:ack" +``` + +Now, to compute the values of the Ackermann function on those arguments, we start by creating a `Struct` with fields `m` and `n` and then use the function `map_elements` to apply the function `ack` to each value: -{{code_block('user-guide/expressions/structs','multi_column_apply',[])}} +{{code_block('user-guide/expressions/structs','struct-ack',['map_elements'])}} -```python exec="on" result="text" session="user-guide/structs" ---8<-- "python/user-guide/expressions/structs.py:multi_column_apply" +```python exec="on" result="text" session="expressions/structs" +--8<-- "python/user-guide/expressions/structs.py:struct-ack" ``` + +Refer to [this section of the user guide to learn more about applying user-defined Python functions to your data](user-defined-python-functions.md). + +[^1]: To say that something cannot be done is quite a bold claim. If you prove us wrong, please let us know! diff --git a/docs/source/user-guide/expressions/user-defined-functions.md b/docs/source/user-guide/expressions/user-defined-python-functions.md similarity index 95% rename from docs/source/user-guide/expressions/user-defined-functions.md rename to docs/source/user-guide/expressions/user-defined-python-functions.md index dc994148c63b..b99a413e8bdd 100644 --- a/docs/source/user-guide/expressions/user-defined-functions.md +++ b/docs/source/user-guide/expressions/user-defined-python-functions.md @@ -1,4 +1,6 @@ -# User-defined functions (Python) +# User-defined Python functions + + Polars expressions are quite powerful and flexible, so there is much less need for custom Python functions compared to other libraries. Still, you may need to pass an expression's state to a third party library or apply your black box function to data in Polars. 
@@ -119,10 +121,10 @@ Passing the full `Series` to the user-defined function has a cost: it may use a You can use the `is_elementwise=True` argument to [:material-api: `map_batches`](https://docs.pola.rs/py-polars/html/reference/expressions/api/polars.Expr.map_batches.html) to stream results into the function, which means it might not get all values at once. !!! note -The `is_elementwise` argument can lead to incorrect results if set incorrectly. -If you set `is_elementwise=True`, make sure that your function actually operates -element-by-element (e.g. "calculate the logarithm of each value") - our example function `diff_from_mean()`, -for instance, does not. + + The `is_elementwise` argument can lead to incorrect results if set incorrectly. + If you set `is_elementwise=True`, make sure that your function actually operates + element-by-element (e.g. "calculate the logarithm of each value") - our example function `diff_from_mean()`, for instance, does not. ## Return types diff --git a/docs/source/user-guide/expressions/window-functions.md b/docs/source/user-guide/expressions/window-functions.md new file mode 100644 index 000000000000..7e8812d2dd10 --- /dev/null +++ b/docs/source/user-guide/expressions/window-functions.md @@ -0,0 +1,147 @@ +# Window functions + +Window functions are expressions with superpowers. +They allow you to perform aggregations on groups within the context `select`. +Let's get a feel for what that means. + +First, we load a Pokémon dataset: + +{{code_block('user-guide/expressions/window','pokemon',['read_csv'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:pokemon" +``` + +## Operations per group + +Window functions are ideal when we want to perform an operation within a group. +For instance, suppose we want to rank our Pokémon by the column “Speed”. +However, instead of a global ranking, we want to rank the speed within each group defined by the column “Type 1”. 
+We write the expression to rank the data by the column “Speed” and then we add the function `over` to specify that this should happen over the unique values of the column “Type 1”: + +{{code_block('user-guide/expressions/window','rank',['over'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:rank" +``` + +To help visualise this operation, you may imagine that Polars selects the subsets of the data that share the same value for the column “Type 1” and then computes the ranking expression only for those values. +Then, the results for that specific group are projected back to the original rows and Polars does this for all of the existing groups. +The diagram below highlights the ranking computation for the Pokémon with “Type 1” equal to “Grass”. + +
+--8<-- "docs/source/user-guide/expressions/speed_rank_by_type.svg" +
+ +Note how the row for the Pokémon “Golbat” has a “Speed” value of `90`, which is greater than the value `80` of the Pokémon “Venusaur”, and yet the latter was ranked 1 because “Golbat” and “Venusaur” do not share the same value for the column “Type 1”. + +The function `over` accepts an arbitrary number of expressions to specify the groups over which to perform the computations. +We can repeat the ranking above, but over the combination of the columns “Type 1” and “Type 2” for a more fine-grained ranking: + +{{code_block('user-guide/expressions/window','rank-multiple',['over'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:rank-multiple" +``` + +In general, the results you get with the function `over` can also be achieved with [an aggregation](aggregation.md) followed by a call to the function `explode`, although the rows would be in a different order: + +{{code_block('user-guide/expressions/window','rank-explode',['explode'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:rank-explode" +``` + +This shows that, usually, `group_by` and `over` produce results of different shapes: + +- `group_by` usually produces a resulting dataframe with as many rows as groups used for aggregating; and +- `over` usually produces a dataframe with the same number of rows as the original. + +The function `over` does not always produce results with the same number of rows as the original dataframe, and that is what we explore next. + +## Mapping results to dataframe rows + +The function `over` accepts a parameter `mapping_strategy` that determines how the results of the expression over the group are mapped back to the rows of the dataframe. 
+ +### `group_to_rows` + +The default behaviour is `"group_to_rows"`: +the result of the expression over the group should be the same length as the group and the results are mapped back to the rows of that group. + +If the order of the rows is not relevant, the option `"explode"` is more performant. +Instead of mapping the resulting values to the original rows, Polars creates a new dataframe where values from the same group are next to each other. +To help understand the distinction, consider the following dataframe: + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:athletes" +``` + +We can sort the athletes by rank within their own countries. +If we do so, the Dutch athletes were in the second, third, and sixth, rows, and they will remain there. +What will change is the order of the names of the athletes, which goes from “B”, “C”, and “F”, to “B”, “F”, and “C”: + +{{code_block('user-guide/expressions/window','athletes-sort-over-country',['over'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:athletes-sort-over-country" +``` + +The diagram below represents this transformation: + +
+--8<-- "docs/source/user-guide/expressions/athletes_over_country.svg" +
+ +### `explode` + +If we set the parameter `mapping_strategy` to `"explode"`, then athletes of the same country are grouped together, but the final order of the rows – with respect to the countries – will not be the same, as the diagram shows: + +
+--8<-- "docs/source/user-guide/expressions/athletes_over_country_explode.svg" +
+ +Because Polars does not need to keep track of the positions of the rows of each group, using `"explode"` is typically faster than `"group_to_rows"`. +However, using `"explode"` also requires more care because it implies reordering the other columns that we wish to keep. +The code that produces this result follows + +{{code_block('user-guide/expressions/window','athletes-explode',['over'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:athletes-explode" +``` + +### `join` + +Another possible value for the parameter `mapping_strategy` is `"join"`, which aggregates the resulting values in a list and repeats the list over all rows of the same group: + +{{code_block('user-guide/expressions/window','athletes-join',['over'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:athletes-join" +``` + +## Windowed aggregation expressions + +In case the expression applied to the values of a group produces a scalar value, the scalar is broadcast across the rows of the group: + +{{code_block('user-guide/expressions/window','pokemon-mean',['over'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:pokemon-mean" +``` + +## More examples + +For more exercises, below are some window functions for us to compute: + +- sort all Pokémon by type; +- select the first `3` Pokémon per type as `"Type 1"`; +- sort the Pokémon within a type by speed in descending order and select the first `3` as `"fastest/group"`; +- sort the Pokémon within a type by attack in descending order and select the first `3` as `"strongest/group"`; and +- sort the Pokémon within a type by name and select the first `3` as `"sorted_by_alphabet"`. 
+ +{{code_block('user-guide/expressions/window','examples',['over'])}} + +```python exec="on" result="text" session="user-guide/window" +--8<-- "python/user-guide/expressions/window.py:examples" +``` diff --git a/docs/source/user-guide/expressions/window.md b/docs/source/user-guide/expressions/window.md deleted file mode 100644 index 261dac180c4d..000000000000 --- a/docs/source/user-guide/expressions/window.md +++ /dev/null @@ -1,91 +0,0 @@ -# Window functions - -Window functions are expressions with superpowers. They allow you to perform aggregations on groups in the -`select` context. Let's get a feel for what that means. First we create a dataset. The dataset loaded in the -snippet below contains information about pokemon: - -{{code_block('user-guide/expressions/window','pokemon',['read_csv'])}} - -```python exec="on" result="text" session="user-guide/window" ---8<-- "python/user-guide/expressions/window.py:pokemon" -``` - -## Group by aggregations in selection - -Below we show how to use window functions to group over different columns and perform an aggregation on them. -Doing so allows us to use multiple group by operations in parallel, using a single query. The results of the aggregation -are projected back to the original rows. Therefore, a window function will almost always lead to a `DataFrame` with the same size as the original. - -We will discuss later the cases where a window function can change the numbers of rows in a `DataFrame`. - -Note how we call `.over("Type 1")` and `.over(["Type 1", "Type 2"])`. Using window functions we can aggregate over different groups in a single `select` call! Note that, in Rust, the type of the argument to `over()` must be a collection, so even when you're only using one column, you must provide it in an array. - -The best part is, this won't cost you anything. The computed groups are cached and shared between different `window` expressions. 
- -{{code_block('user-guide/expressions/window','group_by',['over'])}} - -```python exec="on" result="text" session="user-guide/window" ---8<-- "python/user-guide/expressions/window.py:group_by" -``` - -## Operations per group - -Window functions can do more than aggregation. They can also be viewed as an operation within a group. If, for instance, you -want to `sort` the values within a `group`, you can write `col("value").sort().over("group")` and voilà! We sorted by group! - -Let's filter out some rows to make this more clear. - -{{code_block('user-guide/expressions/window','operations',['filter'])}} - -```python exec="on" result="text" session="user-guide/window" ---8<-- "python/user-guide/expressions/window.py:operations" -``` - -Observe that the group `Water` of column `Type 1` is not contiguous. There are two rows of `Grass` in between. Also note -that each pokemon within a group are sorted by `Speed` in `ascending` order. Unfortunately, for this example we want them sorted in -`descending` speed order. Luckily with window functions this is easy to accomplish. - -{{code_block('user-guide/expressions/window','sort',['over'])}} - -```python exec="on" result="text" session="user-guide/window" ---8<-- "python/user-guide/expressions/window.py:sort" -``` - -Polars keeps track of each group's location and maps the expressions to the proper row locations. This will also work over different groups in a single `select`. - -The power of window expressions is that you often don't need a `group_by -> explode` combination, but you can put the logic in a single expression. It also makes the API cleaner. 
If properly used a: - -- `group_by` -> marks that groups are aggregated and we expect a `DataFrame` of size `n_groups` -- `over` -> marks that we want to compute something within a group, and doesn't modify the original size of the `DataFrame` except in specific cases - -## Map the expression result to the DataFrame rows - -In cases where the expression results in multiple values per group, the Window function has 3 strategies for linking the values back to the `DataFrame` rows: - -- `mapping_strategy = 'group_to_rows'` -> each value is assigned back to one row. The number of values returned should match the number of rows. - -- `mapping_strategy = 'join'` -> the values are imploded in a list, and the list is repeated on all rows. This can be memory intensive. - -- `mapping_strategy = 'explode'` -> the values are exploded to new rows. This operation changes the number of rows. - -## Window expression rules - -The evaluations of window expressions are as follows (assuming we apply it to a `pl.Int32` column): - -{{code_block('user-guide/expressions/window','rules',['over'])}} - -## More examples - -For more exercise, below are some window functions for us to compute: - -- sort all pokemon by type -- select the first `3` pokemon per type as `"Type 1"` -- sort the pokemon within a type by speed in descending order and select the first `3` as `"fastest/group"` -- sort the pokemon within a type by attack in descending order and select the first `3` as `"strongest/group"` -- sort the pokemon within a type by name and select the first `3` as `"sorted_by_alphabet"` - -{{code_block('user-guide/expressions/window','examples',['over'])}} - -```python exec="on" result="text" session="user-guide/window" ---8<-- "python/user-guide/expressions/window.py:examples" -``` diff --git a/docs/source/user-guide/getting-started.md b/docs/source/user-guide/getting-started.md index e571ea71cca1..b0c18b2562b1 100644 --- a/docs/source/user-guide/getting-started.md +++ 
b/docs/source/user-guide/getting-started.md @@ -83,7 +83,7 @@ When using expression expansion you can use `.name.suffix` to add a suffix to th --8<-- "python/user-guide/getting-started.py:expression-expansion" ``` -You can check other sections of the user guide to learn more about [basic operations](expressions/operators.md) or [column selections](expressions/column-selections.md). +You can check other sections of the user guide to learn more about [basic operations](expressions/basic-operations.md) or [column selections in expression expansion](expressions/expression-expansion.md). ### `with_columns` diff --git a/docs/source/user-guide/installation.md b/docs/source/user-guide/installation.md index fdfe83d49dee..0cecd7cd5f4b 100644 --- a/docs/source/user-guide/installation.md +++ b/docs/source/user-guide/installation.md @@ -23,8 +23,8 @@ Polars is a library and installation is as simple as invoking the package manage ## Big Index -By default, Polars dataframes are limited to 232 rows (~4.3 billion). -Increase this limit to 264 (~18 quintillion) by enabling the big index extension: +By default, Polars dataframes are limited to $2^{32}$ rows (~4.3 billion). +Increase this limit to $2^{64}$ (~18 quintillion) by enabling the big index extension: === ":fontawesome-brands-python: Python" @@ -196,7 +196,7 @@ The opt-in features are: - Performance related: - `nightly` - Several nightly only features such as SIMD and specialization. - `performant` - more fast paths, slower compile times. - - `bigidx` - Activate this feature if you expect >> 232 rows. + - `bigidx` - Activate this feature if you expect >> $2^{32}$ rows. This allows polars to scale up way beyond that by using `u64` as an index. Polars will be a bit slower with this feature activated as many data structures are less cache efficient. 
diff --git a/docs/source/user-guide/transformations/index.md b/docs/source/user-guide/transformations/index.md index 3092c5be3c37..fa86181eb58d 100644 --- a/docs/source/user-guide/transformations/index.md +++ b/docs/source/user-guide/transformations/index.md @@ -2,6 +2,8 @@ The focus of this section is to describe different types of data transformations and provide some examples on how to use them. + + - [Joins](joins.md) - [Concatenation](concatenation.md) - [Pivot](pivot.md) diff --git a/mkdocs.yml b/mkdocs.yml index c180bbfc6b8e..62ca12edbe97 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -21,21 +21,20 @@ nav: - user-guide/concepts/lazy-api.md - Expressions: - user-guide/expressions/index.md - - user-guide/expressions/operators.md - - user-guide/expressions/column-selections.md - - user-guide/expressions/functions.md + - user-guide/expressions/basic-operations.md + - user-guide/expressions/expression-expansion.md - user-guide/expressions/casting.md - user-guide/expressions/strings.md + - user-guide/expressions/lists-and-arrays.md - user-guide/expressions/categorical-data-and-enums.md - - user-guide/expressions/aggregation.md + - user-guide/expressions/structs.md - user-guide/expressions/missing-data.md - - user-guide/expressions/window.md + - user-guide/expressions/aggregation.md + - user-guide/expressions/window-functions.md - user-guide/expressions/folds.md - - user-guide/expressions/lists.md - - user-guide/expressions/plugins.md - - user-guide/expressions/user-defined-functions.md - - user-guide/expressions/structs.md - - user-guide/expressions/numpy.md + - user-guide/expressions/plugins-for-custom-expressions.md + - user-guide/expressions/user-defined-python-functions.md + - user-guide/expressions/numpy-functions.md - Transformations: - user-guide/transformations/index.md - user-guide/transformations/joins.md @@ -144,6 +143,9 @@ theme: icon: repo: fontawesome/brands/github +extra_javascript: + - _build/js/mathjax.js + - 
https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js extra_css: - _build/css/extra.css extra: @@ -170,6 +172,8 @@ markdown_extensions: check_paths: true dedent_subsections: true - footnotes + - pymdownx.arithmatex: + generic: true hooks: - docs/source/_build/scripts/people.py diff --git a/py-polars/docs/source/_static/css/custom.css b/py-polars/docs/source/_static/css/custom.css index 7797c1fa0e15..966f1a86d21e 100644 --- a/py-polars/docs/source/_static/css/custom.css +++ b/py-polars/docs/source/_static/css/custom.css @@ -1,5 +1,5 @@ /* To have blue background of width of the block (instead of width of content) */ -dl.class > dt:first-of-type { +dl.class>dt:first-of-type { display: block !important; }