diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index f14abd3e9f..7a5d5b4de2 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -3586,14 +3586,23 @@ def implementation(self) -> Implementation: Examples: >>> import narwhals as nw >>> import polars as pl - >>> lf_native = pl.LazyFrame({"a": [1, 2, 3]}) - >>> lf = nw.from_native(lf_native) + >>> import dask.dataframe as dd + >>> lf_pl = pl.LazyFrame({"a": [1, 2, 3]}) + >>> lf_dask = dd.from_dict({"a": [1, 2, 3]}, npartitions=2) + + >>> lf = nw.from_native(lf_pl) >>> lf.implementation >>> lf.implementation.is_pandas() False >>> lf.implementation.is_polars() True + + >>> lf = nw.from_native(lf_dask) + >>> lf.implementation + + >>> lf.implementation.is_dask() + True """ return self._compliant_frame._implementation # type: ignore[no-any-return] @@ -3610,13 +3619,15 @@ def collect(self) -> DataFrame[Any]: Examples: >>> import narwhals as nw >>> import polars as pl - >>> lf_pl = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) + >>> import dask.dataframe as dd + >>> data = { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) + >>> lf = nw.from_native(lf_pl) >>> lf # doctest:+ELLIPSIS ┌─────────────────────────────┐ @@ -3636,6 +3647,27 @@ def collect(self) -> DataFrame[Any]: │ b ┆ 11 ┆ 10 │ │ c ┆ 6 ┆ 1 │ └─────┴─────┴─────┘ + + >>> lf = nw.from_native(lf_dask) + >>> lf + ┌───────────────────────────────────┐ + | Narwhals LazyFrame | + |-----------------------------------| + |Dask DataFrame Structure: | + | a b c| + |npartitions=2 | + |0 string int64 int64| + |3 ... ... ...| + |5 ... ... ...| + |Dask Name: frompandas, 1 expression| + |Expr=df | + └───────────────────────────────────┘ + >>> df = lf.group_by("a").agg(nw.col("b", "c").sum()).collect() + >>> df.to_native() + a b c + 0 a 4 10 + 1 b 11 10 + 2 c 6 1 """ return self._dataframe( self._compliant_frame.collect(), @@ -3649,23 +3681,16 @@ def to_native(self) -> FrameT: Object of class that user started with. Examples: - >>> import pandas as pd >>> import polars as pl - >>> import pyarrow as pa + >>> import dask.dataframe as dd >>> import narwhals as nw >>> >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) - >>> df_pa = pa.table(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) - Calling `to_native` on a Narwhals DataFrame returns the native object: + Calling `to_native` on a Narwhals LazyFrame returns the native object: - >>> nw.from_native(df_pd).lazy().to_native() - foo bar ham - 0 1 6.0 a - 1 2 7.0 b - 2 3 8.0 c >>> nw.from_native(lf_pl).to_native().collect() shape: (3, 3) ┌─────┬─────┬─────┐ @@ -3677,6 +3702,11 @@ def to_native(self) -> FrameT: │ 2 ┆ 7.0 ┆ b │ │ 3 ┆ 8.0 ┆ c │ └─────┴─────┴─────┘ + >>> nw.from_native(lf_dask).to_native().compute() + foo bar ham + 0 1 6.0 a + 1 2 7.0 b + 2 3 8.0 c """ return to_native(narwhals_object=self, pass_through=False) @@ -3694,28 +3724,23 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se Examples: >>> import polars as pl - >>> import pandas as pd + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2, 3], "ba": [4, 5, 6]} - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) Let's define a dataframe-agnostic function: >>> def agnostic_pipe(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.pipe(lambda _df: _df.select("a")).to_native() + ... return df.pipe(lambda _df: _df.select("a")).collect().to_native() - We can then pass either pandas or Polars: + We can then pass any supported library such as Polars or Dask to `agnostic_pipe`: - >>> agnostic_pipe(df_pd) - a - 0 1 - 1 2 - 2 3 - >>> agnostic_pipe(lf_pl).collect() + >>> agnostic_pipe(lf_pl) shape: (3, 1) ┌─────┐ │ a │ @@ -3726,6 +3751,11 @@ def pipe(self, function: Callable[[Any], Self], *args: Any, **kwargs: Any) -> Se │ 2 │ │ 3 │ └─────┘ + >>> agnostic_pipe(lf_dask) + a + 0 1 + 1 2 + 2 3 """ return super().pipe(function, *args, **kwargs) @@ -3746,26 +3776,23 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: Examples: >>> import polars as pl - >>> import pandas as pd + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1.0, 2.0, None], "ba": [1.0, None, 2.0]} - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) Let's define a dataframe-agnostic function: >>> def agnostic_drop_nulls(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.drop_nulls().to_native() + ... return df.drop_nulls().collect().to_native() - We can then pass any supported library such as Pandas or Polars to `agnostic_drop_nulls`: + We can then pass any supported library such as Polars or Dask to `agnostic_drop_nulls`: - >>> agnostic_drop_nulls(df_pd) - a ba - 0 1.0 1.0 - >>> agnostic_drop_nulls(lf_pl).collect() + >>> agnostic_drop_nulls(lf_pl) shape: (1, 2) ┌─────┬─────┐ │ a ┆ ba │ @@ -3774,6 +3801,9 @@ def drop_nulls(self: Self, subset: str | list[str] | None = None) -> Self: ╞═════╪═════╡ │ 1.0 ┆ 1.0 │ └─────┴─────┘ + >>> agnostic_drop_nulls(lf_dask) + a ba + 0 1.0 1.0 """ return super().drop_nulls(subset=subset) @@ -3788,28 +3818,23 @@ def with_row_index(self, name: str = "index") -> Self: Examples: >>> import polars as pl - >>> import pandas as pd + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2, 3], "b": [4, 5, 6]} - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) Let's define a dataframe-agnostic function: >>> def agnostic_with_row_index(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.with_row_index().to_native() + ... return df.with_row_index().collect().to_native() - We can then pass either pandas or Polars: + We can then pass any supported library such as Polars or Dask to `agnostic_with_row_index`: - >>> agnostic_with_row_index(df_pd) - index a b - 0 0 1 4 - 1 1 2 5 - 2 2 3 6 - >>> agnostic_with_row_index(lf_pl).collect() + >>> agnostic_with_row_index(lf_pl) shape: (3, 3) ┌───────┬─────┬─────┐ │ index ┆ a ┆ b │ @@ -3820,6 +3845,11 @@ def with_row_index(self, name: str = "index") -> Self: │ 1 ┆ 2 ┆ 5 │ │ 2 ┆ 3 ┆ 6 │ └───────┴─────┴─────┘ + >>> agnostic_with_row_index(lf_dask) + index a b + 0 0 1 4 + 1 1 2 5 + 2 2 3 6 """ return super().with_row_index(name) @@ -3832,17 +3862,23 @@ def schema(self) -> Schema: Examples: >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw - >>> lf_pl = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) + >>> data = { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) + >>> lf = nw.from_native(lf_pl) >>> lf.schema # doctest: +SKIP - Schema({'foo': Int64, 'bar': Float64, 'ham', String}) + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) + + >>> lf = nw.from_native(lf_dask) + >>> lf.schema # doctest: +SKIP + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) """ return super().schema @@ -3854,17 +3890,23 @@ def collect_schema(self: Self) -> Schema: Examples: >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw - >>> lf_pl = pl.LazyFrame( - ... { - ... "foo": [1, 2, 3], - ... "bar": [6.0, 7.0, 8.0], - ... "ham": ["a", "b", "c"], - ... } - ... ) + >>> data = { + ... "foo": [1, 2, 3], + ... "bar": [6.0, 7.0, 8.0], + ... "ham": ["a", "b", "c"], + ... } + >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) + >>> lf = nw.from_native(lf_pl) >>> lf.collect_schema() Schema({'foo': Int64, 'bar': Float64, 'ham': String}) + + >>> lf = nw.from_native(lf_dask) + >>> lf.collect_schema() + Schema({'foo': Int64, 'bar': Float64, 'ham': String}) """ return super().collect_schema() @@ -3876,14 +3918,14 @@ def columns(self) -> list[str]: The column names stored in a list. Examples: - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrame >>> - >>> df = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(df) - >>> lf_pl = pl.LazyFrame(df) + >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} + >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) We define a library agnostic function: @@ -3891,12 +3933,12 @@ def columns(self) -> list[str]: ... df = nw.from_native(df_native) ... return df.columns - We can then pass either pandas or Polars to `agnostic_columns`: + We can then pass any supported library such as Polars or Dask to `agnostic_columns`: - >>> agnostic_columns(df_pd) - ['foo', 'bar', 'ham'] >>> agnostic_columns(lf_pl) # doctest: +SKIP ['foo', 'bar', 'ham'] + >>> agnostic_columns(lf_dask) + ['foo', 'bar', 'ham'] """ return super().columns @@ -3923,48 +3965,31 @@ def with_columns( existing data. Examples: - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> - >>> df = { + >>> data = { ... "a": [1, 2, 3, 4], ... "b": [0.5, 4, 10, 13], ... "c": [True, True, False, True], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> lf_pl = pl.LazyFrame(df) + >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) Let's define a dataframe-agnostic function in which we pass an expression to add it as a new column: >>> def agnostic_with_columns(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.with_columns((nw.col("a") * 2).alias("2a")).to_native() + ... return ( + ... df.with_columns((nw.col("a") * 2).alias("2a")).collect().to_native() + ... ) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Polars or Dask to `agnostic_with_columns`: - >>> agnostic_with_columns(df_pd) - a b c 2a - 0 1 0.5 True 2 - 1 2 4.0 True 4 - 2 3 10.0 False 6 - 3 4 13.0 True 8 - >>> agnostic_with_columns(df_pl) - shape: (4, 4) - ┌─────┬──────┬───────┬─────┐ - │ a ┆ b ┆ c ┆ 2a │ - │ --- ┆ --- ┆ --- ┆ --- │ - │ i64 ┆ f64 ┆ bool ┆ i64 │ - ╞═════╪══════╪═══════╪═════╡ - │ 1 ┆ 0.5 ┆ true ┆ 2 │ - │ 2 ┆ 4.0 ┆ true ┆ 4 │ - │ 3 ┆ 10.0 ┆ false ┆ 6 │ - │ 4 ┆ 13.0 ┆ true ┆ 8 │ - └─────┴──────┴───────┴─────┘ - >>> agnostic_with_columns(lf_pl).collect() + >>> agnostic_with_columns(lf_pl) shape: (4, 4) ┌─────┬──────┬───────┬─────┐ │ a ┆ b ┆ c ┆ 2a │ @@ -3976,6 +4001,12 @@ def with_columns( │ 3 ┆ 10.0 ┆ false ┆ 6 │ │ 4 ┆ 13.0 ┆ true ┆ 8 │ └─────┴──────┴───────┴─────┘ + >>> agnostic_with_columns(lf_dask) + a b c 2a + 0 1 0.5 True 2 + 1 2 4.0 True 4 + 2 3 10.0 False 6 + 3 4 13.0 True 8 """ return super().with_columns(*exprs, **named_exprs) @@ -4002,46 +4033,29 @@ def select( `0` use `df.select(nw.col(0))`, not `df.select(0)`. Examples: - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> - >>> df = { + >>> data = { ... "foo": [1, 2, 3], ... "bar": [6, 7, 8], ... "ham": ["a", "b", "c"], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> lf_pl = pl.LazyFrame(df) + >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) Let's define a dataframe-agnostic function in which we pass the name of a column to select that column. >>> def agnostic_select(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.select("foo").to_native() + ... return df.select("foo").collect().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Polars or Dask to `agnostic_select`: - >>> agnostic_select(df_pd) - foo - 0 1 - 1 2 - 2 3 - >>> agnostic_select(df_pl) - shape: (3, 1) - ┌─────┐ - │ foo │ - │ --- │ - │ i64 │ - ╞═════╡ - │ 1 │ - │ 2 │ - │ 3 │ - └─────┘ - >>> agnostic_select(lf_pl).collect() + >>> agnostic_select(lf_pl) shape: (3, 1) ┌─────┐ │ foo │ @@ -4052,30 +4066,19 @@ def select( │ 2 │ │ 3 │ └─────┘ + >>> agnostic_select(lf_dask) + foo + 0 1 + 1 2 + 2 3 Multiple columns can be selected by passing a list of column names. >>> def agnostic_select(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.select(["foo", "bar"]).to_native() - >>> - >>> agnostic_select(df_pd) - foo bar - 0 1 6 - 1 2 7 - 2 3 8 - >>> agnostic_select(df_pl) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 6 │ - │ 2 ┆ 7 │ - │ 3 ┆ 8 │ - └─────┴─────┘ - >>> agnostic_select(lf_pl).collect() + ... return df.select(["foo", "bar"]).collect().to_native() + + >>> agnostic_select(lf_pl) shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -4086,31 +4089,20 @@ def select( │ 2 ┆ 7 │ │ 3 ┆ 8 │ └─────┴─────┘ + >>> agnostic_select(lf_dask) + foo bar + 0 1 6 + 1 2 7 + 2 3 8 Multiple columns can also be selected using positional arguments instead of a list. Expressions are also accepted. >>> def agnostic_select(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.select(nw.col("foo"), nw.col("bar") + 1).to_native() - >>> - >>> agnostic_select(df_pd) - foo bar - 0 1 7 - 1 2 8 - 2 3 9 - >>> agnostic_select(df_pl) - shape: (3, 2) - ┌─────┬─────┐ - │ foo ┆ bar │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - >>> agnostic_select(lf_pl).collect() + ... return df.select(nw.col("foo"), nw.col("bar") + 1).collect().to_native() + + >>> agnostic_select(lf_pl) shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -4121,30 +4113,19 @@ def select( │ 2 ┆ 8 │ │ 3 ┆ 9 │ └─────┴─────┘ + >>> agnostic_select(lf_dask) + foo bar + 0 1 7 + 1 2 8 + 2 3 9 Use keyword arguments to easily name your expression inputs. >>> def agnostic_select(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.select(threshold=nw.col("foo") * 2).to_native() - >>> - >>> agnostic_select(df_pd) - threshold - 0 2 - 1 4 - 2 6 - >>> agnostic_select(df_pl) - shape: (3, 1) - ┌───────────┐ - │ threshold │ - │ --- │ - │ i64 │ - ╞═══════════╡ - │ 2 │ - │ 4 │ - │ 6 │ - └───────────┘ - >>> agnostic_select(lf_pl).collect() + ... return df.select(threshold=nw.col("foo") * 2).collect().to_native() + + >>> agnostic_select(lf_pl) shape: (3, 1) ┌───────────┐ │ threshold │ @@ -4155,6 +4136,11 @@ def select( │ 4 │ │ 6 │ └───────────┘ + >>> agnostic_select(lf_dask) + threshold + 0 2 + 1 4 + 2 6 """ return super().select(*exprs, **named_exprs) @@ -4170,29 +4156,24 @@ def rename(self, mapping: dict[str, str]) -> Self: The LazyFrame with the specified columns renamed. Examples: - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> >>> data = {"foo": [1, 2, 3], "bar": [6, 7, 8], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) We define a library agnostic function: >>> def agnostic_rename(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.rename({"foo": "apple"}).to_native() + ... return df.rename({"foo": "apple"}).collect().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Polars or Dask to `agnostic_rename`: - >>> agnostic_rename(df_pd) - apple bar ham - 0 1 6 a - 1 2 7 b - 2 3 8 c - >>> agnostic_rename(lf_pl).collect() + >>> agnostic_rename(lf_pl) shape: (3, 3) ┌───────┬─────┬─────┐ │ apple ┆ bar ┆ ham │ @@ -4203,6 +4184,11 @@ def rename(self, mapping: dict[str, str]) -> Self: │ 2 ┆ 7 ┆ b │ │ 3 ┆ 8 ┆ c │ └───────┴─────┴─────┘ + >>> agnostic_rename(lf_dask) + apple bar ham + 0 1 6 a + 1 2 7 b + 2 3 8 c """ return super().rename(mapping) @@ -4217,43 +4203,26 @@ def head(self, n: int = 5) -> Self: Examples: >>> import narwhals as nw - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> from narwhals.typing import IntoFrameT >>> >>> data = { ... "a": [1, 2, 3, 4, 5, 6], ... "b": [7, 8, 9, 10, 11, 12], ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) Let's define a dataframe-agnostic function that gets the first 3 rows. >>> def agnostic_head(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.head(3).to_native() + ... return df.head(3).collect().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Polars or Dask to `agnostic_head`: - >>> agnostic_head(df_pd) - a b - 0 1 7 - 1 2 8 - 2 3 9 - >>> agnostic_head(df_pl) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 1 ┆ 7 │ - │ 2 ┆ 8 │ - │ 3 ┆ 9 │ - └─────┴─────┘ - >>> agnostic_head(lf_pl).collect() + >>> agnostic_head(lf_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -4264,6 +4233,11 @@ def head(self, n: int = 5) -> Self: │ 2 ┆ 8 │ │ 3 ┆ 9 │ └─────┴─────┘ + >>> agnostic_head(lf_dask) + a b + 0 1 7 + 1 2 8 + 2 3 9 """ return super().head(n) @@ -4276,45 +4250,32 @@ def tail(self, n: int = 5) -> Self: Returns: A subset of the LazyFrame of shape (n, n_columns). + Notes: + `LazyFrame.tail` is not supported for the Dask backend with multiple + partitions. + Examples: >>> import narwhals as nw - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> from narwhals.typing import IntoFrameT >>> >>> data = { ... "a": [1, 2, 3, 4, 5, 6], ... "b": [7, 8, 9, 10, 11, 12], ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=1) Let's define a dataframe-agnostic function that gets the last 3 rows. >>> def agnostic_tail(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.tail(3).to_native() + ... return df.tail(3).collect().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Polars or Dask to `agnostic_tail`: - >>> agnostic_tail(df_pd) - a b - 3 4 10 - 4 5 11 - 5 6 12 - >>> agnostic_tail(df_pl) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 4 ┆ 10 │ - │ 5 ┆ 11 │ - │ 6 ┆ 12 │ - └─────┴─────┘ - >>> agnostic_tail(lf_pl).collect() + >>> agnostic_tail(lf_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -4325,6 +4286,11 @@ def tail(self, n: int = 5) -> Self: │ 5 ┆ 11 │ │ 6 ┆ 12 │ └─────┴─────┘ + >>> agnostic_tail(lf_dask) + a b + 3 4 10 + 4 5 11 + 5 6 12 """ return super().tail(n) @@ -4345,29 +4311,24 @@ def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: Please consider upgrading to a newer version or pass to eager mode. Examples: - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> >>> data = {"foo": [1, 2, 3], "bar": [6.0, 7.0, 8.0], "ham": ["a", "b", "c"]} - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) We define a library agnostic function: >>> def agnostic_drop(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.drop("ham").to_native() + ... return df.drop("ham").collect().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Polars or Dask to `agnostic_drop`: - >>> agnostic_drop(df_pd) - foo bar - 0 1 6.0 - 1 2 7.0 - 2 3 8.0 - >>> agnostic_drop(lf_pl).collect() + >>> agnostic_drop(lf_pl) shape: (3, 2) ┌─────┬─────┐ │ foo ┆ bar │ @@ -4378,19 +4339,19 @@ def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: │ 2 ┆ 7.0 │ │ 3 ┆ 8.0 │ └─────┴─────┘ + >>> agnostic_drop(lf_dask) + foo bar + 0 1 6.0 + 1 2 7.0 + 2 3 8.0 Use positional arguments to drop multiple columns. >>> def agnostic_drop(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.drop("foo", "ham").to_native() + ... return df.drop("foo", "ham").collect().to_native() - >>> agnostic_drop(df_pd) - bar - 0 6.0 - 1 7.0 - 2 8.0 - >>> agnostic_drop(lf_pl).collect() + >>> agnostic_drop(lf_pl) shape: (3, 1) ┌─────┐ │ bar │ @@ -4401,6 +4362,11 @@ def drop(self, *columns: str | Iterable[str], strict: bool = True) -> Self: │ 7.0 │ │ 8.0 │ └─────┘ + >>> agnostic_drop(lf_dask) + bar + 0 6.0 + 1 7.0 + 2 8.0 """ return super().drop(*flatten(columns), strict=strict) @@ -4425,11 +4391,11 @@ def unique( maintain_order: Has no effect and is kept around only for backwards-compatibility. Returns: - LazyFrame: LazyFrame with unique rows. + The LazyFrame with unique rows. Examples: - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> @@ -4438,21 +4404,18 @@ def unique( ... "bar": ["a", "a", "a", "a"], ... "ham": ["b", "b", "b", "b"], ... } - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) We define a library agnostic function: >>> def agnostic_unique(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.unique(["bar", "ham"]).to_native() + ... return df.unique(["bar", "ham"]).collect().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Polars or Dask to `agnostic_unique`: - >>> agnostic_unique(df_pd) - foo bar ham - 0 1 a b - >>> agnostic_unique(lf_pl).collect() + >>> agnostic_unique(lf_pl) shape: (1, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -4461,6 +4424,9 @@ def unique( ╞═════╪═════╪═════╡ │ 1 ┆ a ┆ b │ └─────┴─────┴─────┘ + >>> agnostic_unique(lf_dask) + foo bar ham + 0 1 a b """ if keep not in {"any", "none"}: msg = ( @@ -4501,8 +4467,8 @@ def filter( The filtered LazyFrame. Examples: - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> @@ -4511,34 +4477,19 @@ def filter( ... "bar": [6, 7, 8], ... "ham": ["a", "b", "c"], ... } - >>> df_pd = pd.DataFrame(data) - >>> df_pl = pl.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) Let's define a dataframe-agnostic function in which we filter on one condition. >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.filter(nw.col("foo") > 1).to_native() + ... return df.filter(nw.col("foo") > 1).collect().to_native() - We can then pass either pandas or Polars to `agnostic_filter`: + We can then pass any supported library such as Polars or Dask to `agnostic_filter`: - >>> agnostic_filter(df_pd) - foo bar ham - 1 2 7 b - 2 3 8 c - >>> agnostic_filter(df_pl) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - >>> agnostic_filter(lf_pl).collect() + >>> agnostic_filter(lf_pl) shape: (2, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -4548,26 +4499,22 @@ def filter( │ 2 ┆ 7 ┆ b │ │ 3 ┆ 8 ┆ c │ └─────┴─────┴─────┘ + >>> agnostic_filter(lf_dask) + foo bar ham + 1 2 7 b + 2 3 8 c Filter on multiple conditions: >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.filter((nw.col("foo") < 3) & (nw.col("ham") == "a")).to_native() - >>> - >>> agnostic_filter(df_pd) - foo bar ham - 0 1 6 a - >>> agnostic_filter(df_pl) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - >>> agnostic_filter(lf_pl).collect() + ... return ( + ... df.filter((nw.col("foo") < 3) & (nw.col("ham") == "a")) + ... .collect() + ... .to_native() + ... ) + + >>> agnostic_filter(lf_pl) shape: (1, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -4576,29 +4523,24 @@ def filter( ╞═════╪═════╪═════╡ │ 1 ┆ 6 ┆ a │ └─────┴─────┴─────┘ + >>> agnostic_filter(lf_dask) + foo bar ham + 0 1 6 a Provide multiple filters using `*args` syntax: >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.filter( - ... nw.col("foo") == 1, - ... nw.col("ham") == "a", - ... ).to_native() - >>> - >>> agnostic_filter(df_pd) - foo bar ham - 0 1 6 a - >>> agnostic_filter(df_pl) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - └─────┴─────┴─────┘ - >>> agnostic_filter(lf_pl).collect() + ... return ( + ... df.filter( + ... nw.col("foo") == 1, + ... nw.col("ham") == "a", + ... ) + ... .collect() + ... .to_native() + ... ) + + >>> agnostic_filter(lf_pl) shape: (1, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -4607,30 +4549,21 @@ def filter( ╞═════╪═════╪═════╡ │ 1 ┆ 6 ┆ a │ └─────┴─────┴─────┘ + >>> agnostic_filter(lf_dask) + foo bar ham + 0 1 6 a Filter on an OR condition: >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.filter( - ... (nw.col("foo") == 1) | (nw.col("ham") == "c") - ... ).to_native() - >>> - >>> agnostic_filter(df_pd) - foo bar ham - 0 1 6 a - 2 3 8 c - >>> agnostic_filter(df_pl) - shape: (2, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 1 ┆ 6 ┆ a │ - │ 3 ┆ 8 ┆ c │ - └─────┴─────┴─────┘ - >>> agnostic_filter(lf_pl).collect() + ... return ( + ... df.filter((nw.col("foo") == 1) | (nw.col("ham") == "c")) + ... .collect() + ... .to_native() + ... ) + + >>> agnostic_filter(lf_pl) shape: (2, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -4640,26 +4573,18 @@ def filter( │ 1 ┆ 6 ┆ a │ │ 3 ┆ 8 ┆ c │ └─────┴─────┴─────┘ + >>> agnostic_filter(lf_dask) + foo bar ham + 0 1 6 a + 2 3 8 c Provide multiple filters using `**kwargs` syntax: >>> def agnostic_filter(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.filter(foo=2, ham="b").to_native() - >>> - >>> agnostic_filter(df_pd) - foo bar ham - 1 2 7 b - >>> agnostic_filter(df_pl) - shape: (1, 3) - ┌─────┬─────┬─────┐ - │ foo ┆ bar ┆ ham │ - │ --- ┆ --- ┆ --- │ - │ i64 ┆ i64 ┆ str │ - ╞═════╪═════╪═════╡ - │ 2 ┆ 7 ┆ b │ - └─────┴─────┴─────┘ - >>> agnostic_filter(lf_pl).collect() + ... return df.filter(foo=2, ham="b").collect().to_native() + + >>> agnostic_filter(lf_pl) shape: (1, 3) ┌─────┬─────┬─────┐ │ foo ┆ bar ┆ ham │ @@ -4668,6 +4593,9 @@ def filter( ╞═════╪═════╪═════╡ │ 2 ┆ 7 ┆ b │ └─────┴─────┴─────┘ + >>> agnostic_filter(lf_dask) + foo bar ham + 1 2 7 b """ return super().filter(*predicates, **constraints) @@ -4684,52 +4612,41 @@ def group_by( included in the result. Returns: - LazyGroupBy: Object which can be used to perform aggregations. + Object which can be used to perform aggregations. Examples: Group by one column and call `agg` to compute the grouped sum of another column. - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> - >>> df = { + >>> data = { ... "a": ["a", "b", "a", "b", "c"], ... "b": [1, 2, 1, 3, 3], ... "c": [5, 4, 3, 2, 1], ... } - >>> df_pd = pd.DataFrame(df) - >>> df_pl = pl.DataFrame(df) - >>> lf_pl = pl.LazyFrame(df) + >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) Let's define a dataframe-agnostic function in which we group by one column and call `agg` to compute the grouped sum of another column. >>> def agnostic_group_by_agg(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.group_by("a").agg(nw.col("b").sum()).sort("a").to_native() + ... return ( + ... df.group_by("a") + ... .agg(nw.col("b").sum()) + ... .sort("a") + ... .collect() + ... .to_native() + ... ) - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Polars or Dask to `agnostic_group_by_agg`: - >>> agnostic_group_by_agg(df_pd) - a b - 0 a 2 - 1 b 5 - 2 c 3 - >>> agnostic_group_by_agg(df_pl) - shape: (3, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ str ┆ i64 │ - ╞═════╪═════╡ - │ a ┆ 2 │ - │ b ┆ 5 │ - │ c ┆ 3 │ - └─────┴─────┘ - >>> agnostic_group_by_agg(lf_pl).collect() + >>> agnostic_group_by_agg(lf_pl) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -4740,34 +4657,25 @@ def group_by( │ b ┆ 5 │ │ c ┆ 3 │ └─────┴─────┘ + >>> agnostic_group_by_agg(lf_dask) + a b + 0 a 2 + 1 b 5 + 2 c 3 Group by multiple columns by passing a list of column names. >>> def agnostic_group_by_agg(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return ( - ... df.group_by(["a", "b"]).agg(nw.max("c")).sort(["a", "b"]).to_native() + ... df.group_by(["a", "b"]) + ... .agg(nw.max("c")) + ... .sort(["a", "b"]) + ... .collect() + ... .to_native() ... ) - >>> - >>> agnostic_group_by_agg(df_pd) - a b c - 0 a 1 5 - 1 b 2 4 - 2 b 3 2 - 3 c 3 1 - >>> agnostic_group_by_agg(df_pl) - shape: (4, 3) - ┌─────┬─────┬─────┐ - │ a ┆ b ┆ c │ - │ --- ┆ --- ┆ --- │ - │ str ┆ i64 ┆ i64 │ - ╞═════╪═════╪═════╡ - │ a ┆ 1 ┆ 5 │ - │ b ┆ 2 ┆ 4 │ - │ b ┆ 3 ┆ 2 │ - │ c ┆ 3 ┆ 1 │ - └─────┴─────┴─────┘ - >>> agnostic_group_by_agg(lf_pl).collect() + + >>> agnostic_group_by_agg(lf_pl) shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -4779,6 +4687,12 @@ def group_by( │ b ┆ 3 ┆ 2 │ │ c ┆ 3 ┆ 1 │ └─────┴─────┴─────┘ + >>> agnostic_group_by_agg(lf_dask) + a b c + 0 a 1 5 + 1 b 2 4 + 2 b 3 2 + 3 c 3 1 """ from narwhals.expr import Expr from narwhals.group_by import LazyGroupBy @@ -4820,8 +4734,8 @@ def sort( Examples: >>> import narwhals as nw - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> from narwhals.typing import IntoFrameT >>> >>> data = { @@ -4829,24 +4743,19 @@ def sort( ... "b": [6.0, 5.0, 4.0], ... "c": ["a", "c", "b"], ... } - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) Let's define a dataframe-agnostic function in which we sort by multiple columns in different orders >>> def agnostic_sort(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.sort("c", "a", descending=[False, True]).to_native() + ... return df.sort("c", "a", descending=[False, True]).collect().to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as Polars or Dask to `agnostic_sort`: - >>> agnostic_sort(df_pd) - a b c - 0 1.0 6.0 a - 2 NaN 4.0 b - 1 2.0 5.0 c - >>> agnostic_sort(lf_pl).collect() + >>> agnostic_sort(lf_pl) shape: (3, 3) ┌──────┬─────┬─────┐ │ a ┆ b ┆ c │ @@ -4857,6 +4766,11 @@ def sort( │ null ┆ 4.0 ┆ b │ │ 2 ┆ 5.0 ┆ c │ └──────┴─────┴─────┘ + >>> agnostic_sort(lf_dask) + a b c + 0 1.0 6.0 a + 2 NaN 4.0 b + 1 2.0 5.0 c """ return super().sort(by, *more_by, descending=descending, nulls_last=nulls_last) @@ -4892,8 +4806,8 @@ def join( Examples: >>> import narwhals as nw - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> from narwhals.typing import IntoFrameT >>> >>> data = { @@ -4906,11 +4820,10 @@ def join( ... "ham": ["a", "b", "d"], ... } - >>> df_pd = pd.DataFrame(data) - >>> other_pd = pd.DataFrame(data_other) - >>> lf_pl = pl.LazyFrame(data) >>> other_pl = pl.LazyFrame(data_other) + >>> lf_dask = dd.from_dict(data, npartitions=2) + >>> other_dask = dd.from_dict(data_other, npartitions=2) Let's define a dataframe-agnostic function in which we join over "ham" column: @@ -4920,16 +4833,16 @@ def join( ... ) -> IntoFrameT: ... df = nw.from_native(df_native) ... other = nw.from_native(other_native) - ... return df.join(other, left_on="ham", right_on="ham").to_native() - - We can now pass either pandas or Polars to the function: + ... return ( + ... df.join(other, left_on="ham", right_on="ham") + ... .sort("ham") + ... .collect() + ... .to_native() + ... ) - >>> agnostic_join_on_ham(df_pd, other_pd) - foo bar ham apple - 0 1 6.0 a x - 1 2 7.0 b y + We can then pass any supported library such as Polars or Dask to `agnostic_join_on_ham`: - >>> agnostic_join_on_ham(lf_pl, other_pl).collect() + >>> agnostic_join_on_ham(lf_pl, other_pl) shape: (2, 4) ┌─────┬─────┬─────┬───────┐ │ foo ┆ bar ┆ ham ┆ apple │ @@ -4939,6 +4852,10 @@ def join( │ 1 ┆ 6.0 ┆ a ┆ x │ │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ + >>> agnostic_join_on_ham(lf_dask, other_dask) + foo bar ham apple + 0 1 6.0 a x + 0 2 7.0 b y """ return super().join( other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix @@ -4989,8 +4906,8 @@ def join_asof( Examples: >>> from datetime import datetime >>> import narwhals as nw - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> from typing import Literal >>> from narwhals.typing import IntoFrameT >>> @@ -5012,10 +4929,10 @@ def join_asof( ... ], ... "population": [82.19, 82.66, 83.12], ... } - >>> gdp_pd = pd.DataFrame(data_gdp) - >>> population_pd = pd.DataFrame(data_population) - >>> gdp_pl = pl.LazyFrame(data_gdp).sort("datetime") - >>> population_pl = pl.LazyFrame(data_population).sort("datetime") + >>> gdp_pl = pl.LazyFrame(data_gdp) + >>> population_pl = pl.LazyFrame(data_population) + >>> gdp_dask = dd.from_dict(data_gdp, npartitions=2) + >>> population_dask = dd.from_dict(data_population, npartitions=2) Let's define a dataframe-agnostic function in which we join over "datetime" column: @@ -5026,19 +4943,16 @@ def join_asof( ... ) -> IntoFrameT: ... df = nw.from_native(df_native) ... other = nw.from_native(other_native) - ... return df.join_asof(other, on="datetime", strategy=strategy).to_native() - - We can now pass either pandas or Polars to the function: + ... return ( + ... df.sort("datetime") + ... .join_asof(other, on="datetime", strategy=strategy) + ... .collect() + ... .to_native() + ... ) - >>> agnostic_join_asof_datetime(population_pd, gdp_pd, strategy="backward") - datetime population gdp - 0 2016-03-01 82.19 4164 - 1 2018-08-01 82.66 4566 - 2 2019-01-01 83.12 4696 + We can then pass any supported library such as Polars or Dask to `agnostic_join_asof_datetime`: - >>> agnostic_join_asof_datetime( - ... population_pl, gdp_pl, strategy="backward" - ... ).collect() + >>> agnostic_join_asof_datetime(population_pl, gdp_pl, strategy="backward") shape: (3, 3) ┌─────────────────────┬────────────┬──────┐ │ datetime ┆ population ┆ gdp │ @@ -5049,13 +4963,18 @@ def join_asof( │ 2018-08-01 00:00:00 ┆ 82.66 ┆ 4566 │ │ 2019-01-01 00:00:00 ┆ 83.12 ┆ 4696 │ └─────────────────────┴────────────┴──────┘ + >>> agnostic_join_asof_datetime(population_dask, gdp_dask, strategy="backward") + datetime population gdp + 0 2016-03-01 82.19 4164 + 1 2018-08-01 82.66 4566 + 0 2019-01-01 83.12 4696 Here is a real-world times-series example that uses `by` argument. >>> from datetime import datetime >>> import narwhals as nw - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> from narwhals.typing import IntoFrameT >>> >>> data_quotes = { @@ -5094,10 +5013,10 @@ def join_asof( ... "price": [51.95, 51.95, 720.77, 720.92, 98.0], ... "quantity": [75, 155, 100, 100, 100], ... } - >>> quotes_pd = pd.DataFrame(data_quotes) - >>> trades_pd = pd.DataFrame(data_trades) - >>> quotes_pl = pl.LazyFrame(data_quotes).sort("datetime") - >>> trades_pl = pl.LazyFrame(data_trades).sort("datetime") + >>> quotes_pl = pl.LazyFrame(data_quotes) + >>> trades_pl = pl.LazyFrame(data_trades) + >>> quotes_dask = dd.from_dict(data_quotes, npartitions=2) + >>> trades_dask = dd.from_dict(data_trades, npartitions=2) Let's define a dataframe-agnostic function in which we join over "datetime" and by "ticker" columns: @@ -5107,19 +5026,16 @@ def join_asof( ... ) -> IntoFrameT: ... df = nw.from_native(df_native) ... other = nw.from_native(other_native) - ... return df.join_asof(other, on="datetime", by="ticker").to_native() - - We can now pass either pandas or Polars to the function: + ... return ( + ... df.sort("datetime") + ... .join_asof(other, on="datetime", by="ticker") + ... .collect() + ... .to_native() + ... ) - >>> agnostic_join_asof_datetime_by_ticker(trades_pd, quotes_pd) - datetime ticker price quantity bid ask - 0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96 - 1 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98 - 2 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 - 3 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93 - 4 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN + We can then pass any supported library such as Polars or Dask to `agnostic_join_asof_datetime_by_ticker`: - >>> agnostic_join_asof_datetime_by_ticker(trades_pl, quotes_pl).collect() + >>> agnostic_join_asof_datetime_by_ticker(trades_pl, quotes_pl) shape: (5, 6) ┌────────────────────────────┬────────┬────────┬──────────┬───────┬────────┐ │ datetime ┆ ticker ┆ price ┆ quantity ┆ bid ┆ ask │ @@ -5132,6 +5048,13 @@ def join_asof( │ 2016-05-25 13:30:00.000048 ┆ GOOG ┆ 720.92 ┆ 100 ┆ 720.5 ┆ 720.93 │ │ 2016-05-25 13:30:00.000048 ┆ AAPL ┆ 98.0 ┆ 100 ┆ null ┆ null │ └────────────────────────────┴────────┴────────┴──────────┴───────┴────────┘ + >>> agnostic_join_asof_datetime_by_ticker(trades_dask, quotes_dask) + datetime ticker price quantity bid ask + 0 2016-05-25 13:30:00.000023 MSFT 51.95 75 51.95 51.96 + 0 2016-05-25 13:30:00.000038 MSFT 51.95 155 51.97 51.98 + 1 2016-05-25 13:30:00.000048 GOOG 720.92 100 720.50 720.93 + 2 2016-05-25 13:30:00.000048 AAPL 98.00 100 NaN NaN + 3 2016-05-25 13:30:00.000048 GOOG 720.77 100 720.50 720.93 """ return super().join_asof( other, @@ -5152,26 +5075,21 @@ def clone(self) -> Self: Examples: >>> import narwhals as nw - >>> import pandas as pd >>> import polars as pl >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2], "b": [3, 4]} - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) Let's define a dataframe-agnostic function in which we copy the DataFrame: >>> def agnostic_clone(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.clone().to_native() + ... return df.clone().collect().to_native() - >>> agnostic_clone(df_pd) - a b - 0 1 3 - 1 2 4 + We can then pass any supported library such as Polars to `agnostic_clone`: - >>> agnostic_clone(lf_pl).collect() + >>> agnostic_clone(lf_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -5234,27 +5152,24 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: Examples: >>> import narwhals as nw - >>> import pandas as pd >>> import polars as pl + >>> import dask.dataframe as dd >>> from narwhals.typing import IntoFrameT >>> >>> data = {"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]} - >>> df_pd = pd.DataFrame(data) >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) - Let's define a dataframe-agnostic function in which gather every 2 rows, + Let's define a dataframe-agnostic function in which we gather every 2 rows, starting from a offset of 1: >>> def agnostic_gather_every(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) - ... return df.gather_every(n=2, offset=1).to_native() + ... return df.gather_every(n=2, offset=1).collect().to_native() - >>> agnostic_gather_every(df_pd) - a b - 1 2 6 - 3 4 8 + We can then pass any supported library such as Polars or Dask to `agnostic_gather_every`: - >>> agnostic_gather_every(lf_pl).collect() + >>> agnostic_gather_every(lf_pl) shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -5264,6 +5179,10 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self: │ 2 ┆ 6 │ │ 4 ┆ 8 │ └─────┴─────┘ + >>> agnostic_gather_every(lf_dask) + a b + 1 2 6 + 3 4 8 """ return super().gather_every(n=n, offset=offset) @@ -5302,6 +5221,7 @@ def unpivot( Examples: >>> import narwhals as nw >>> import polars as pl + >>> import dask.dataframe as dd >>> from narwhals.typing import IntoFrameT >>> >>> data = { @@ -5310,16 +5230,21 @@ def unpivot( ... "c": [2, 4, 6], ... } >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) We define a library agnostic function: >>> def agnostic_unpivot(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return ( - ... df.unpivot(on=["b", "c"], index="a").sort(["variable", "a"]) - ... ).to_native() + ... (df.unpivot(on=["b", "c"], index="a").sort(["variable", "a"])) + ... .collect() + ... .to_native() + ... ) + + We can then pass any supported library such as Polars or Dask to `agnostic_unpivot`: - >>> agnostic_unpivot(lf_pl).collect() + >>> agnostic_unpivot(lf_pl) shape: (6, 3) ┌─────┬──────────┬───────┐ │ a ┆ variable ┆ value │ @@ -5333,6 +5258,14 @@ def unpivot( │ y ┆ c ┆ 4 │ │ z ┆ c ┆ 6 │ └─────┴──────────┴───────┘ + >>> agnostic_unpivot(lf_dask) + a variable value + 0 x b 1 + 1 y b 3 + 0 z b 5 + 2 x c 2 + 3 y c 4 + 1 z c 6 """ return super().unpivot( on=on, index=index, variable_name=variable_name, value_name=value_name @@ -5342,7 +5275,7 @@ def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Sel """Explode the dataframe to long format by exploding the given columns. Notes: - It is possible to explode multiple columns only if these columns must have + It is possible to explode multiple columns only if these columns have matching element counts. Arguments: @@ -5369,13 +5302,13 @@ def explode(self: Self, columns: str | Sequence[str], *more_columns: str) -> Sel ... nw.from_native(df_native) ... .with_columns(nw.col("lst1", "lst2").cast(nw.List(nw.Int32()))) ... .explode("lst1", "lst2") + ... .collect() ... .to_native() ... ) - We can then pass any supported library such as pandas, Polars (eager), - or PyArrow to `agnostic_explode`: + We can then pass any supported library such as Polars to `agnostic_explode`: - >>> agnostic_explode(pl.LazyFrame(data)).collect() + >>> agnostic_explode(pl.LazyFrame(data)) shape: (5, 3) ┌─────┬──────┬──────┐ │ a ┆ lst1 ┆ lst2 │ diff --git a/narwhals/stable/v1/__init__.py b/narwhals/stable/v1/__init__.py index 8bf4a4b1e6..ba51174252 100644 --- a/narwhals/stable/v1/__init__.py +++ b/narwhals/stable/v1/__init__.py @@ -422,13 +422,15 @@ def collect(self) -> DataFrame[Any]: Examples: >>> import narwhals as nw >>> import polars as pl - >>> lf_pl = pl.LazyFrame( - ... { - ... "a": ["a", "b", "a", "b", "b", "c"], - ... "b": [1, 2, 3, 4, 5, 6], - ... "c": [6, 5, 4, 3, 2, 1], - ... } - ... ) + >>> import dask.dataframe as dd + >>> data = { + ... "a": ["a", "b", "a", "b", "b", "c"], + ... "b": [1, 2, 3, 4, 5, 6], + ... "c": [6, 5, 4, 3, 2, 1], + ... } + >>> lf_pl = pl.LazyFrame(data) + >>> lf_dask = dd.from_dict(data, npartitions=2) + >>> lf = nw.from_native(lf_pl) >>> lf # doctest:+ELLIPSIS ┌─────────────────────────────┐ @@ -448,6 +450,27 @@ def collect(self) -> DataFrame[Any]: │ b ┆ 11 ┆ 10 │ │ c ┆ 6 ┆ 1 │ └─────┴─────┴─────┘ + + >>> lf = nw.from_native(lf_dask) + >>> lf + ┌───────────────────────────────────┐ + | Narwhals LazyFrame | + |-----------------------------------| + |Dask DataFrame Structure: | + | a b c| + |npartitions=2 | + |0 string int64 int64| + |3 ... ... ...| + |5 ... ... ...| + |Dask Name: frompandas, 1 expression| + |Expr=df | + └───────────────────────────────────┘ + >>> df = lf.group_by("a").agg(nw.col("b", "c").sum()).collect() + >>> df.to_native() + a b c + 0 a 4 10 + 1 b 11 10 + 2 c 6 1 """ return super().collect() # type: ignore[return-value]