diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index a96c81812ba1..ce2050d6d9a2 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -344,7 +344,7 @@ impl DataFrame { /// let df1: DataFrame = df!("Name" => &["James", "Mary", "John", "Patricia"])?; /// assert_eq!(df1.shape(), (4, 1)); /// - /// let df2: DataFrame = df1.with_row_count("Id", None)?; + /// let df2: DataFrame = df1.with_row_index("Id", None)?; /// assert_eq!(df2.shape(), (4, 2)); /// println!("{}", df2); /// @@ -369,7 +369,7 @@ impl DataFrame { /// | 3 | Patricia | /// +-----+----------+ /// ``` - pub fn with_row_count(&self, name: &str, offset: Option<IdxSize>) -> PolarsResult<DataFrame> { + pub fn with_row_index(&self, name: &str, offset: Option<IdxSize>) -> PolarsResult<DataFrame> { let mut columns = Vec::with_capacity(self.columns.len() + 1); let offset = offset.unwrap_or(0); diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index 39a822d891f5..7e82bfdc3bab 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -1642,7 +1642,7 @@ impl LazyFrame { /// # Warning /// This can have a negative effect on query performance. This may for instance block /// predicate pushdown optimization. - pub fn with_row_count(mut self, name: &str, offset: Option<IdxSize>) -> LazyFrame { + pub fn with_row_index(mut self, name: &str, offset: Option<IdxSize>) -> LazyFrame { let add_row_count_in_map = match &mut self.logical_plan { LogicalPlan::Scan { file_options: options, diff --git a/crates/polars-lazy/src/scan/anonymous_scan.rs b/crates/polars-lazy/src/scan/anonymous_scan.rs index 2a26305eb84b..e6cea2a81aad 100644 --- a/crates/polars-lazy/src/scan/anonymous_scan.rs +++ b/crates/polars-lazy/src/scan/anonymous_scan.rs @@ -42,7 +42,7 @@ impl LazyFrame { .into(); if let Some(rc) = args.row_count { - lf = lf.with_row_count(&rc.name, Some(rc.offset)) + lf = lf.with_row_index(&rc.name, Some(rc.offset)) }; Ok(lf) diff --git a/crates/polars-lazy/src/scan/file_list_reader.rs b/crates/polars-lazy/src/scan/file_list_reader.rs index 18a1a62a29d3..524985d37d8f 100644 --- a/crates/polars-lazy/src/scan/file_list_reader.rs +++ b/crates/polars-lazy/src/scan/file_list_reader.rs @@ -61,7 +61,7 @@ pub trait LazyFileListReader: Clone { lf = lf.slice(0, n_rows as IdxSize) }; if let Some(rc) = self.row_count() { - lf = lf.with_row_count(&rc.name, Some(rc.offset)) + lf = lf.with_row_index(&rc.name, Some(rc.offset)) }; Ok(lf) diff --git a/crates/polars-lazy/src/scan/ipc.rs b/crates/polars-lazy/src/scan/ipc.rs index c5cba7838bbd..dacee16302c0 100644 --- a/crates/polars-lazy/src/scan/ipc.rs +++ b/crates/polars-lazy/src/scan/ipc.rs @@ -65,7 +65,7 @@ impl LazyFileListReader for LazyIpcReader { // it is a bit hacky, but this row_count function updates the schema if let Some(row_count) = args.row_count { - lf = lf.with_row_count(&row_count.name, Some(row_count.offset)) + lf = lf.with_row_index(&row_count.name, Some(row_count.offset)) } Ok(lf) diff --git a/crates/polars-lazy/src/scan/parquet.rs b/crates/polars-lazy/src/scan/parquet.rs index de3023d2f7c2..b03cab5f7210 100644 --- a/crates/polars-lazy/src/scan/parquet.rs +++ b/crates/polars-lazy/src/scan/parquet.rs @@ -88,9 +88,9 @@ impl LazyFileListReader for LazyParquetReader { .build() .into(); - // it is a bit hacky, but this row_count function updates the schema + // it is a bit hacky, but this row_index function updates the schema if let Some(row_count) = row_count { - lf = lf.with_row_count(&row_count.name,
Some(row_count.offset)) + lf = lf.with_row_index(&row_count.name, Some(row_count.offset)) } lf.opt_state.file_caching = true; diff --git a/crates/polars-lazy/src/tests/io.rs b/crates/polars-lazy/src/tests/io.rs index 19577d7bda61..dcf93617535f 100644 --- a/crates/polars-lazy/src/tests/io.rs +++ b/crates/polars-lazy/src/tests/io.rs @@ -373,42 +373,42 @@ fn test_row_count_on_files() -> PolarsResult<()> { for offset in [0 as IdxSize, 10] { let lf = LazyCsvReader::new(FOODS_CSV) .with_row_count(Some(RowCount { - name: "rc".into(), + name: "index".into(), offset, })) .finish()?; assert!(row_count_at_scan(lf.clone())); let df = lf.collect()?; - let rc = df.column("rc")?; + let idx = df.column("index")?; assert_eq!( - rc.idx()?.into_no_null_iter().collect::<Vec<_>>(), + idx.idx()?.into_no_null_iter().collect::<Vec<_>>(), (offset..27 + offset).collect::<Vec<_>>() ); let lf = LazyFrame::scan_parquet(FOODS_PARQUET, Default::default())? - .with_row_count("rc", Some(offset)); + .with_row_index("index", Some(offset)); assert!(row_count_at_scan(lf.clone())); let df = lf.collect()?; - let rc = df.column("rc")?; + let idx = df.column("index")?; assert_eq!( - rc.idx()?.into_no_null_iter().collect::<Vec<_>>(), + idx.idx()?.into_no_null_iter().collect::<Vec<_>>(), (offset..27 + offset).collect::<Vec<_>>() ); - let lf = - LazyFrame::scan_ipc(FOODS_IPC, Default::default())?.with_row_count("rc", Some(offset)); + let lf = LazyFrame::scan_ipc(FOODS_IPC, Default::default())? + .with_row_index("index", Some(offset)); assert!(row_count_at_scan(lf.clone())); let df = lf.clone().collect()?; - let rc = df.column("rc")?; + let idx = df.column("index")?; assert_eq!( - rc.idx()?.into_no_null_iter().collect::<Vec<_>>(), + idx.idx()?.into_no_null_iter().collect::<Vec<_>>(), (offset..27 + offset).collect::<Vec<_>>() ); let out = lf - .filter(col("rc").gt(lit(-1))) + .filter(col("index").gt(lit(-1))) .select([col("calories")]) .collect()?; assert!(out.column("calories").is_ok()); diff --git a/crates/polars-lazy/src/tests/optimization_checks.rs b/crates/polars-lazy/src/tests/optimization_checks.rs index cdc3e7e49e7e..9741d8ef3add 100644 --- a/crates/polars-lazy/src/tests/optimization_checks.rs +++ b/crates/polars-lazy/src/tests/optimization_checks.rs @@ -351,11 +351,11 @@ fn test_with_row_count_opts() -> PolarsResult<()> { let out = df .clone() .lazy() - .with_row_count("row_nr", None) + .with_row_index("index", None) .tail(5) .collect()?; let expected = df![ - "row_nr" => [5 as IdxSize, 6, 7, 8, 9], + "index" => [5 as IdxSize, 6, 7, 8, 9], "a" => [5, 6, 7, 8, 9], ]?; @@ -363,11 +363,11 @@ let out = df .clone() .lazy() - .with_row_count("row_nr", None) + .with_row_index("index", None) .slice(1, 2) .collect()?; assert_eq!( - out.column("row_nr")? + out.column("index")? .idx()? .into_no_null_iter() .collect::<Vec<_>>(), @@ -377,11 +377,11 @@ let out = df .clone() .lazy() - .with_row_count("row_nr", None) + .with_row_index("index", None) .filter(col("a").eq(lit(3i32))) .collect()?; assert_eq!( - out.column("row_nr")? + out.column("index")? .idx()? .into_no_null_iter() .collect::<Vec<_>>(), @@ -392,10 +392,10 @@ .clone() .lazy() .slice(1, 2) - .with_row_count("row_nr", None) + .with_row_index("index", None) .collect()?; assert_eq!( - out.column("row_nr")? + out.column("index")? .idx()?
.into_no_null_iter() .collect::<Vec<_>>(), @@ -405,10 +405,10 @@ let out = df .lazy() .filter(col("a").eq(lit(3i32))) - .with_row_count("row_nr", None) + .with_row_index("index", None) .collect()?; assert_eq!( - out.column("row_nr")? + out.column("index")? .idx()? .into_no_null_iter() .collect::<Vec<_>>(), diff --git a/crates/polars-lazy/src/tests/projection_queries.rs b/crates/polars-lazy/src/tests/projection_queries.rs index 0ef064672713..37dc39f95939 100644 --- a/crates/polars-lazy/src/tests/projection_queries.rs +++ b/crates/polars-lazy/src/tests/projection_queries.rs @@ -65,7 +65,7 @@ fn test_cross_join_pd() -> PolarsResult<()> { } #[test] -fn test_row_count_pd() -> PolarsResult<()> { +fn test_row_number_pd() -> PolarsResult<()> { let df = df![ "x" => [1, 2, 3], "y" => [3, 2, 1], ]?; @@ -73,12 +73,12 @@ let df = df .lazy() - .with_row_count("row_count", None) - .select([col("row_count"), col("x") * lit(3i32)]) + .with_row_index("index", None) + .select([col("index"), col("x") * lit(3i32)]) .collect()?; let expected = df![ - "row_count" => [0 as IdxSize, 1, 2], + "index" => [0 as IdxSize, 1, 2], "x" => [3i32, 6, 9] ]?; diff --git a/crates/polars-plan/src/logical_plan/functions/mod.rs b/crates/polars-plan/src/logical_plan/functions/mod.rs index 7517d2a1f627..c5a3390a638a 100644 --- a/crates/polars-plan/src/logical_plan/functions/mod.rs +++ b/crates/polars-plan/src/logical_plan/functions/mod.rs @@ -347,7 +347,7 @@ impl FunctionNode { let args = (**args).clone(); df.melt2(args) }, - RowCount { name, offset, .. } => df.with_row_count(name.as_ref(), *offset), + RowCount { name, offset, .. } => df.with_row_index(name.as_ref(), *offset), } } } diff --git a/crates/polars/tests/it/lazy/explodes.rs b/crates/polars/tests/it/lazy/explodes.rs index 01cc6ff69db7..b42e23edb3a2 100644 --- a/crates/polars/tests/it/lazy/explodes.rs +++ b/crates/polars/tests/it/lazy/explodes.rs @@ -10,9 +10,9 @@ fn test_explode_row_numbers() -> PolarsResult<()> { ]?
.lazy() .select([col("text").str().split(lit(" ")).alias("tokens")]) - .with_row_count("row_nr", None) + .with_row_index("index", None) .explode([col("tokens")]) - .select([col("row_nr"), col("tokens")]) + .select([col("index"), col("tokens")]) .collect()?; assert_eq!(df.shape(), (8, 2)); diff --git a/crates/polars/tests/it/lazy/queries.rs b/crates/polars/tests/it/lazy/queries.rs index 90a576720a14..cb1ec55bf574 100644 --- a/crates/polars/tests/it/lazy/queries.rs +++ b/crates/polars/tests/it/lazy/queries.rs @@ -142,13 +142,13 @@ fn test_sorted_path() -> PolarsResult<()> { let out = df .lazy() - .with_row_count("row_nr", None) + .with_row_index("index", None) .explode(["a"]) - .group_by(["row_nr"]) + .group_by(["index"]) .agg([col("a").count().alias("count")]) .collect()?; - let s = out.column("row_nr")?; + let s = out.column("index")?; assert_eq!(s.is_sorted_flag(), IsSorted::Ascending); Ok(()) diff --git a/docs/src/python/user-guide/expressions/column-selections.py b/docs/src/python/user-guide/expressions/column-selections.py index 88951eaee831..52d210f6d66a 100644 --- a/docs/src/python/user-guide/expressions/column-selections.py +++ b/docs/src/python/user-guide/expressions/column-selections.py @@ -1,11 +1,10 @@ # --8<-- [start:setup] -import polars as pl - # --8<-- [end:setup] - # --8<-- [start:selectors_df] from datetime import date, datetime +import polars as pl + df = pl.DataFrame( { "id": [9, 4, 2], @@ -17,7 +16,7 @@ datetime(2022, 12, 1), datetime(2022, 12, 1, 0, 0, 2), "1s", eager=True ), } -).with_row_count("rn") +).with_row_index("index") print(df) # --8<-- [end:selectors_df] @@ -30,7 +29,7 @@ # --8<-- [end:all] # --8<-- [start:exclude] -out = df.select(pl.col("*").exclude("logged_at", "rn")) +out = df.select(pl.col("*").exclude("logged_at", "index")) print(out) # --8<-- [end:exclude] @@ -62,12 +61,12 @@ # --8<-- [end:selectors_diff] # --8<-- [start:selectors_union] -out = df.select(cs.by_name("rn") | ~cs.numeric()) +out = df.select(cs.by_name("index") | ~cs.numeric()) print(out) # --8<-- [end:selectors_union] # --8<-- [start:selectors_by_name] -out = df.select(cs.contains("rn"), cs.matches(".*_.*")) +out = df.select(cs.contains("index"), cs.matches(".*_.*")) print(out) # --8<-- [end:selectors_by_name] diff --git a/docs/src/rust/user-guide/expressions/column-selections.rs b/docs/src/rust/user-guide/expressions/column-selections.rs index d33ed96531f3..f3cacebd8c0c 100644 --- a/docs/src/rust/user-guide/expressions/column-selections.rs +++ b/docs/src/rust/user-guide/expressions/column-selections.rs @@ -16,7 +16,7 @@ fn main() -> Result<(), Box> { "logged_at" => date_range("logged_at", NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(), NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 2).unwrap(), Duration::parse("1s"),ClosedWindow::Both, TimeUnit::Milliseconds, None)?, )? 
- .with_row_count("rn", None)?; + .with_row_index("index", None)?; println!("{}", &df); // --8<-- [end:selectors_df] @@ -33,7 +33,7 @@ fn main() -> Result<(), Box> { let out = df .clone() .lazy() - .select([col("*").exclude(["logged_at", "rn"])]) + .select([col("*").exclude(["logged_at", "index"])]) .collect()?; println!("{}", &out); // --8<-- [end:exclude] diff --git a/py-polars/docs/source/reference/api.rst b/py-polars/docs/source/reference/api.rst index 26c708ea1fac..f8c04144cd15 100644 --- a/py-polars/docs/source/reference/api.rst +++ b/py-polars/docs/source/reference/api.rst @@ -84,7 +84,7 @@ Examples self._df = df def by_alternate_rows(self) -> list[pl.DataFrame]: - df = self._df.with_row_count(name="n") + df = self._df.with_row_index(name="n") return [ df.filter((pl.col("n") % 2) == 0).drop("n"), df.filter((pl.col("n") % 2) != 0).drop("n"), diff --git a/py-polars/docs/source/reference/dataframe/modify_select.rst b/py-polars/docs/source/reference/dataframe/modify_select.rst index b787cdb619a7..1a82e58027b0 100644 --- a/py-polars/docs/source/reference/dataframe/modify_select.rst +++ b/py-polars/docs/source/reference/dataframe/modify_select.rst @@ -80,3 +80,4 @@ Manipulation/selection DataFrame.with_columns DataFrame.with_columns_seq DataFrame.with_row_count + DataFrame.with_row_index diff --git a/py-polars/docs/source/reference/lazyframe/modify_select.rst b/py-polars/docs/source/reference/lazyframe/modify_select.rst index 1c55df34fb78..c71126c7093a 100644 --- a/py-polars/docs/source/reference/lazyframe/modify_select.rst +++ b/py-polars/docs/source/reference/lazyframe/modify_select.rst @@ -54,3 +54,4 @@ Manipulation/selection LazyFrame.with_columns_seq LazyFrame.with_context LazyFrame.with_row_count + LazyFrame.with_row_index diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 84634ddc3918..666d88e82214 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -5205,10 +5205,73 @@ def pipe( """ return function(self, *args, **kwargs) + def with_row_index(self, name: str = "index", offset: int = 0) -> Self: + """ + Add a row index as the first column in the DataFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> df = pl.DataFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> df.with_row_index() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> df.with_row_index("id", offset=1000) + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + """ + try: + return self._from_pydf(self._df.with_row_index(name, offset)) + except OverflowError: + issue = "negative" if offset < 0 else "greater than the maximum index value" + msg = f"`offset` input for `with_row_index` cannot be {issue}, got {offset}" + raise ValueError(msg) from None + + @deprecate_function( + "Use `with_row_index` instead." 
+ "Note that the default column name has changed from 'row_nr' to 'index'.", + version="0.20.4", + ) def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: """ Add a column at index 0 that counts the rows. + .. deprecated:: + Use `meth`:with_row_index` instead. + Note that the default column name has changed from 'row_nr' to 'index'. + Parameters ---------- name @@ -5224,7 +5287,7 @@ def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: ... "b": [2, 4, 6], ... } ... ) - >>> df.with_row_count() + >>> df.with_row_count() # doctest: +SKIP shape: (3, 3) ┌────────┬─────┬─────┐ │ row_nr ┆ a ┆ b │ @@ -5236,7 +5299,7 @@ def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: │ 2 ┆ 5 ┆ 6 │ └────────┴─────┴─────┘ """ - return self._from_pydf(self._df.with_row_count(name, offset)) + return self.with_row_index(name, offset) def group_by( self, diff --git a/py-polars/polars/expr/expr.py b/py-polars/polars/expr/expr.py index a8d0668680f9..4c2516c9ed83 100644 --- a/py-polars/polars/expr/expr.py +++ b/py-polars/polars/expr/expr.py @@ -5708,45 +5708,45 @@ def rolling_min( >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() + ... ).with_row_index() >>> df_temporal shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ >>> df_temporal.with_columns( - ... rolling_row_min=pl.col("row_nr").rolling_min( + ... rolling_row_min=pl.col("index").rolling_min( ... window_size="2h", by="date", closed="left" ... ) ... 
) shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_min │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ - └────────┴─────────────────────┴─────────────────┘ + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_min │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22 │ + └───────┴─────────────────────┴─────────────────┘ """ window_size = deprecate_saturating(window_size) window_size, min_periods = _prepare_rolling_window_args( @@ -5916,72 +5916,72 @@ def rolling_max( >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() + ... ).with_row_index() >>> df_temporal shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ Compute the rolling max with the default left closure of temporal windows >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( + ... rolling_row_max=pl.col("index").rolling_max( ... window_size="2h", by="date", closed="left" ... ) ... 
) shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ - └────────┴─────────────────────┴─────────────────┘ + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23 │ + └───────┴─────────────────────┴─────────────────┘ Compute the rolling max with the closure of windows on both sides >>> df_temporal.with_columns( - ... rolling_row_max=pl.col("row_nr").rolling_max( + ... rolling_row_max=pl.col("index").rolling_max( ... window_size="2h", by="date", closed="both" ... ) ... ) shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_max │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ - └────────┴─────────────────────┴─────────────────┘ + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_max │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 2 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 21 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 22 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 23 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 24 │ + └───────┴─────────────────────┴─────────────────┘ """ window_size = deprecate_saturating(window_size) window_size, min_periods = _prepare_rolling_window_args( @@ -6155,72 +6155,72 @@ def rolling_mean( >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() + ... 
).with_row_index() >>> df_temporal shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ Compute the rolling mean with the default left closure of temporal windows >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... rolling_row_mean=pl.col("index").rolling_mean( ... window_size="2h", by="date", closed="left" ... ) ... ) shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ - └────────┴─────────────────────┴──────────────────┘ + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 19.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 20.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 21.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 22.5 │ + └───────┴─────────────────────┴──────────────────┘ Compute the rolling mean with the closure of windows on both sides >>> df_temporal.with_columns( - ... rolling_row_mean=pl.col("row_nr").rolling_mean( + ... rolling_row_mean=pl.col("index").rolling_mean( ... window_size="2h", by="date", closed="both" ... ) ... 
) shape: (25, 3) - ┌────────┬─────────────────────┬──────────────────┐ - │ row_nr ┆ date ┆ rolling_row_mean │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪══════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ - └────────┴─────────────────────┴──────────────────┘ + ┌───────┬─────────────────────┬──────────────────┐ + │ index ┆ date ┆ rolling_row_mean │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪══════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 2.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 20.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 21.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 22.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 23.0 │ + └───────┴─────────────────────┴──────────────────┘ """ window_size = deprecate_saturating(window_size) window_size, min_periods = _prepare_rolling_window_args( @@ -6396,72 +6396,72 @@ def rolling_sum( >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() + ... ).with_row_index() >>> df_temporal shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ Compute the rolling sum with the default left closure of temporal windows >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... rolling_row_sum=pl.col("index").rolling_sum( ... window_size="2h", by="date", closed="left" ... ) ... 
) shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ - └────────┴─────────────────────┴─────────────────┘ + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 3 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 39 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 41 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 43 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 45 │ + └───────┴─────────────────────┴─────────────────┘ Compute the rolling sum with the closure of windows on both sides >>> df_temporal.with_columns( - ... rolling_row_sum=pl.col("row_nr").rolling_sum( + ... rolling_row_sum=pl.col("index").rolling_sum( ... window_size="2h", by="date", closed="both" ... ) ... ) shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_sum │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ u32 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ - └────────┴─────────────────────┴─────────────────┘ + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_sum │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ u32 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 1 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 3 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 6 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 60 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 63 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 66 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 69 │ + └───────┴─────────────────────┴─────────────────┘ """ window_size = deprecate_saturating(window_size) window_size, min_periods = _prepare_rolling_window_args( @@ -6634,72 +6634,72 @@ def rolling_std( >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() + ... 
).with_row_index() >>> df_temporal shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ Compute the rolling std with the default left closure of temporal windows >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( + ... rolling_row_std=pl.col("index").rolling_std( ... window_size="2h", by="date", closed="left" ... ) ... ) shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ - └────────┴─────────────────────┴─────────────────┘ + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.707107 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.707107 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.707107 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.707107 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.707107 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.707107 │ + └───────┴─────────────────────┴─────────────────┘ Compute the rolling std with the closure of windows on both sides >>> df_temporal.with_columns( - ... rolling_row_std=pl.col("row_nr").rolling_std( + ... rolling_row_std=pl.col("index").rolling_std( ... window_size="2h", by="date", closed="both" ... ) ... 
) shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_std │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_std │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.707107 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ """ window_size = deprecate_saturating(window_size) window_size, min_periods = _prepare_rolling_window_args( @@ -6879,72 +6879,72 @@ def rolling_var( >>> stop = datetime(2001, 1, 2) >>> df_temporal = pl.DataFrame( ... {"date": pl.datetime_range(start, stop, "1h", eager=True)} - ... ).with_row_count() + ... ).with_row_index() >>> df_temporal shape: (25, 2) - ┌────────┬─────────────────────┐ - │ row_nr ┆ date │ - │ --- ┆ --- │ - │ u32 ┆ datetime[μs] │ - ╞════════╪═════════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 │ - │ 1 ┆ 2001-01-01 01:00:00 │ - │ 2 ┆ 2001-01-01 02:00:00 │ - │ 3 ┆ 2001-01-01 03:00:00 │ - │ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 │ - │ 22 ┆ 2001-01-01 22:00:00 │ - │ 23 ┆ 2001-01-01 23:00:00 │ - │ 24 ┆ 2001-01-02 00:00:00 │ - └────────┴─────────────────────┘ + ┌───────┬─────────────────────┐ + │ index ┆ date │ + │ --- ┆ --- │ + │ u32 ┆ datetime[μs] │ + ╞═══════╪═════════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 │ + │ 1 ┆ 2001-01-01 01:00:00 │ + │ 2 ┆ 2001-01-01 02:00:00 │ + │ 3 ┆ 2001-01-01 03:00:00 │ + │ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 │ + │ 22 ┆ 2001-01-01 22:00:00 │ + │ 23 ┆ 2001-01-01 23:00:00 │ + │ 24 ┆ 2001-01-02 00:00:00 │ + └───────┴─────────────────────┘ Compute the rolling var with the default left closure of temporal windows >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( + ... rolling_row_var=pl.col("index").rolling_var( ... window_size="2h", by="date", closed="left" ... ) ... 
) shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ - └────────┴─────────────────────┴─────────────────┘ + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ null │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.0 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 0.5 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 0.5 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 0.5 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 0.5 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 0.5 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 0.5 │ + └───────┴─────────────────────┴─────────────────┘ Compute the rolling var with the closure of windows on both sides >>> df_temporal.with_columns( - ... rolling_row_var=pl.col("row_nr").rolling_var( + ... rolling_row_var=pl.col("index").rolling_var( ... window_size="2h", by="date", closed="both" ... ) ... ) shape: (25, 3) - ┌────────┬─────────────────────┬─────────────────┐ - │ row_nr ┆ date ┆ rolling_row_var │ - │ --- ┆ --- ┆ --- │ - │ u32 ┆ datetime[μs] ┆ f64 │ - ╞════════╪═════════════════════╪═════════════════╡ - │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ - │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ - │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ - │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ - │ … ┆ … ┆ … │ - │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ - │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ - │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ - │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ - └────────┴─────────────────────┴─────────────────┘ + ┌───────┬─────────────────────┬─────────────────┐ + │ index ┆ date ┆ rolling_row_var │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ datetime[μs] ┆ f64 │ + ╞═══════╪═════════════════════╪═════════════════╡ + │ 0 ┆ 2001-01-01 00:00:00 ┆ 0.0 │ + │ 1 ┆ 2001-01-01 01:00:00 ┆ 0.5 │ + │ 2 ┆ 2001-01-01 02:00:00 ┆ 1.0 │ + │ 3 ┆ 2001-01-01 03:00:00 ┆ 1.0 │ + │ … ┆ … ┆ … │ + │ 21 ┆ 2001-01-01 21:00:00 ┆ 1.0 │ + │ 22 ┆ 2001-01-01 22:00:00 ┆ 1.0 │ + │ 23 ┆ 2001-01-01 23:00:00 ┆ 1.0 │ + │ 24 ┆ 2001-01-02 00:00:00 ┆ 1.0 │ + └───────┴─────────────────────┴─────────────────┘ """ window_size = deprecate_saturating(window_size) window_size, min_periods = _prepare_rolling_window_args( diff --git a/py-polars/polars/io/ipc/functions.py b/py-polars/polars/io/ipc/functions.py index 2fc7332fbb74..eacb4c3956f1 100644 --- a/py-polars/polars/io/ipc/functions.py +++ b/py-polars/polars/io/ipc/functions.py @@ -93,7 +93,7 @@ def read_ipc( tbl = pa.feather.read_table(data, memory_map=memory_map, columns=columns) df = pl.DataFrame._from_arrow(tbl, rechunk=rechunk) if row_count_name is not None: - df = df.with_row_count(row_count_name, row_count_offset) + df = df.with_row_index(row_count_name, row_count_offset) if n_rows is not None: df = df.slice(0, n_rows) return df @@ -169,7 +169,7 @@ def read_ipc_stream( tbl = reader.read_all() df = pl.DataFrame._from_arrow(tbl, rechunk=rechunk) if row_count_name is not None: - df = df.with_row_count(row_count_name, row_count_offset) + df = df.with_row_index(row_count_name, row_count_offset) if n_rows is not None: df = df.slice(0, n_rows) return df diff --git 
a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py index bc1de5bc3f47..d3ec947c4b7b 100644 --- a/py-polars/polars/lazyframe/frame.py +++ b/py-polars/polars/lazyframe/frame.py @@ -441,7 +441,7 @@ def _scan_parquet( if n_rows: scan = scan.head(n_rows) if row_count_name is not None: - scan = scan.with_row_count(row_count_name, row_count_offset) + scan = scan.with_row_index(row_count_name, row_count_offset) return scan # type: ignore[return-value] if storage_options: @@ -504,7 +504,7 @@ def _scan_ipc( if n_rows: scan = scan.head(n_rows) if row_count_name is not None: - scan = scan.with_row_count(row_count_name, row_count_offset) + scan = scan.with_row_index(row_count_name, row_count_offset) return scan # type: ignore[return-value] self = cls.__new__(cls) @@ -4563,10 +4563,78 @@ def approx_n_unique(self) -> Self: """ return self.select(F.all().approx_n_unique()) + def with_row_index(self, name: str = "index", offset: int = 0) -> Self: + """ + Add a row index as the first column in the LazyFrame. + + Parameters + ---------- + name + Name of the index column. + offset + Start the index at this offset. Cannot be negative. + + Warnings + -------- + Using this function can have a negative effect on query performance. + This may, for instance, block predicate pushdown optimization. + + Notes + ----- + The resulting column does not have any special properties. It is a regular + column of type `UInt32` (or `UInt64` in `polars-u64-idx`). + + Examples + -------- + >>> lf = pl.LazyFrame( + ... { + ... "a": [1, 3, 5], + ... "b": [2, 4, 6], + ... } + ... ) + >>> lf.with_row_index().collect() + shape: (3, 3) + ┌───────┬─────┬─────┐ + │ index ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞═══════╪═════╪═════╡ + │ 0 ┆ 1 ┆ 2 │ + │ 1 ┆ 3 ┆ 4 │ + │ 2 ┆ 5 ┆ 6 │ + └───────┴─────┴─────┘ + >>> lf.with_row_index("id", offset=1000).collect() + shape: (3, 3) + ┌──────┬─────┬─────┐ + │ id ┆ a ┆ b │ + │ --- ┆ --- ┆ --- │ + │ u32 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 1000 ┆ 1 ┆ 2 │ + │ 1001 ┆ 3 ┆ 4 │ + │ 1002 ┆ 5 ┆ 6 │ + └──────┴─────┴─────┘ + """ + try: + return self._from_pyldf(self._ldf.with_row_index(name, offset)) + except OverflowError: + issue = "negative" if offset < 0 else "greater than the maximum index value" + msg = f"`offset` input for `with_row_index` cannot be {issue}, got {offset}" + raise ValueError(msg) from None + + @deprecate_function( + "Use `with_row_index` instead." + " Note that the default column name has changed from 'row_nr' to 'index'.", + version="0.20.4", + ) def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: """ Add a column at index 0 that counts the rows. + .. deprecated:: 0.20.4 + Use :meth:`with_row_index` instead. + Note that the default column name has changed from 'row_nr' to 'index'. + Parameters ---------- name @@ -4587,7 +4655,7 @@ def with_row_count(self, name: str = "row_nr", offset: int = 0) -> Self: ... "b": [2, 4, 6], ... } ...
) - >>> lf.with_row_count().collect() + >>> lf.with_row_count().collect() # doctest: +SKIP shape: (3, 3) ┌────────┬─────┬─────┐ │ row_nr ┆ a ┆ b │ │ --- ┆ --- ┆ --- │ │ u32 ┆ i64 ┆ i64 │ ╞════════╪═════╪═════╡ │ 0 ┆ 1 ┆ 2 │ │ 1 ┆ 3 ┆ 4 │ │ 2 ┆ 5 ┆ 6 │ └────────┴─────┴─────┘ """ - return self._from_pyldf(self._ldf.with_row_count(name, offset)) + return self.with_row_index(name, offset) def gather_every(self, n: int, offset: int = 0) -> Self: """ @@ -5784,8 +5852,8 @@ def update( # no keys provided--use row count row_count_used = True row_count_name = "__POLARS_ROW_COUNT" - self = self.with_row_count(row_count_name) - other = other.with_row_count(row_count_name) + self = self.with_row_index(row_count_name) + other = other.with_row_index(row_count_name) left_on = right_on = [row_count_name] else: # one of left or right is missing, raise error diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index d7cd5147e79e..ad59173bc782 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -4526,8 +4526,8 @@ def scatter( It is better to implement this as follows: - >>> s.to_frame().with_row_count("row_nr").select( - ... pl.when(pl.col("row_nr") == 1).then(10).otherwise(pl.col("a")) + >>> s.to_frame().with_row_index().select( + ... pl.when(pl.col("index") == 1).then(10).otherwise(pl.col("a")) ... ) shape: (3, 1) ┌─────────┐ diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 392160be27b0..e97ff9460f27 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -1136,10 +1136,10 @@ impl PyDataFrame { } } - pub fn with_row_count(&self, name: &str, offset: Option<IdxSize>) -> PyResult<Self> { + pub fn with_row_index(&self, name: &str, offset: Option<IdxSize>) -> PyResult<Self> { let df = self .df - .with_row_count(name, offset) + .with_row_index(name, offset) .map_err(PyPolarsErr::from)?; Ok(df.into()) } diff --git a/py-polars/src/lazyframe/mod.rs b/py-polars/src/lazyframe/mod.rs index de95d923bb4b..6d003d8fa202 100644 --- a/py-polars/src/lazyframe/mod.rs +++ b/py-polars/src/lazyframe/mod.rs @@ -986,9 +986,9 @@ impl PyLazyFrame { ldf.melt(args).into() } - fn with_row_count(&self, name: &str, offset: Option<IdxSize>) -> Self { + fn with_row_index(&self, name: &str, offset: Option<IdxSize>) -> Self { let ldf = self.ldf.clone(); - ldf.with_row_count(name, offset).into() + ldf.with_row_index(name, offset).into() } #[pyo3(signature = (lambda, predicate_pushdown, projection_pushdown, slice_pushdown, streamable, schema, validate_output))] diff --git a/py-polars/tests/unit/dataframe/test_df.py b/py-polars/tests/unit/dataframe/test_df.py index 5683494674a8..a09ac3512b98 100644 --- a/py-polars/tests/unit/dataframe/test_df.py +++ b/py-polars/tests/unit/dataframe/test_df.py @@ -1679,13 +1679,47 @@ def test_select_by_dtype(df: pl.DataFrame) -> None: } -def test_with_row_count() -> None: +def test_with_row_index() -> None: df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]}) - out = df.with_row_count() + out = df.with_row_index() + assert out["index"].to_list() == [0, 1, 2] + + out = df.lazy().with_row_index().collect() + assert out["index"].to_list() == [0, 1, 2] + + +def test_with_row_index_bad_offset() -> None: + df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]}) + + with pytest.raises(ValueError, match="cannot be negative"): + df.with_row_index(offset=-1) + with pytest.raises( + ValueError, match="cannot be greater than the maximum index value" + ): + df.with_row_index(offset=2**32) + + +def test_with_row_index_bad_offset_lazy() -> None: + lf =
pl.LazyFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]}) + + with pytest.raises(ValueError, match="cannot be negative"): + lf.with_row_index(offset=-1) + with pytest.raises( + ValueError, match="cannot be greater than the maximum index value" + ): + lf.with_row_index(offset=2**32) + + +def test_with_row_count_deprecated() -> None: + df = pl.DataFrame({"a": [1, 1, 3], "b": [1.0, 2.0, 2.0]}) + + with pytest.deprecated_call(): + out = df.with_row_count() assert out["row_nr"].to_list() == [0, 1, 2] - out = df.lazy().with_row_count().collect() + with pytest.deprecated_call(): + out = df.lazy().with_row_count().collect() assert out["row_nr"].to_list() == [0, 1, 2] diff --git a/py-polars/tests/unit/io/test_lazy_csv.py b/py-polars/tests/unit/io/test_lazy_csv.py index f20b8ea9d24e..855b4eca538c 100644 --- a/py-polars/tests/unit/io/test_lazy_csv.py +++ b/py-polars/tests/unit/io/test_lazy_csv.py @@ -67,7 +67,7 @@ def test_row_count(foods_file_path: Path) -> None: df = ( pl.scan_csv(foods_file_path, row_count_name="row_count") - .with_row_count("foo", 10) + .with_row_index("foo", 10) .filter(pl.col("category") == pl.lit("vegetables")) .collect() ) @@ -195,13 +195,13 @@ def test_lazy_n_rows(foods_file_path: Path) -> None: def test_lazy_row_count_no_push_down(foods_file_path: Path) -> None: plan = ( pl.scan_csv(foods_file_path) - .with_row_count() - .filter(pl.col("row_nr") == 1) + .with_row_index() + .filter(pl.col("index") == 1) .filter(pl.col("category") == pl.lit("vegetables")) .explain(predicate_pushdown=True) ) # related to row count is not pushed. - assert 'FILTER [(col("row_nr")) == (1)] FROM' in plan + assert 'FILTER [(col("index")) == (1)] FROM' in plan # unrelated to row count is pushed. assert 'SELECTION: [(col("category")) == (String(vegetables))]' in plan @@ -283,5 +283,5 @@ def test_scan_empty_csv_with_row_count(tmp_path: Path) -> None: df = pl.DataFrame({"a": []}) df.write_csv(file_path) - read = pl.scan_csv(file_path).with_row_count("idx") + read = pl.scan_csv(file_path).with_row_index("idx") assert read.collect().schema == OrderedDict([("idx", pl.UInt32), ("a", pl.String)]) diff --git a/py-polars/tests/unit/io/test_lazy_ipc.py b/py-polars/tests/unit/io/test_lazy_ipc.py index e12b0658a292..05b10486b1c0 100644 --- a/py-polars/tests/unit/io/test_lazy_ipc.py +++ b/py-polars/tests/unit/io/test_lazy_ipc.py @@ -29,7 +29,7 @@ def test_row_count(foods_ipc_path: Path) -> None: df = ( pl.scan_ipc(foods_ipc_path, row_count_name="row_count") - .with_row_count("foo", 10) + .with_row_index("foo", 10) .filter(pl.col("category") == pl.lit("vegetables")) .collect() ) diff --git a/py-polars/tests/unit/io/test_lazy_json.py b/py-polars/tests/unit/io/test_lazy_json.py index 36d6ae4c49f5..7d9b1bedd642 100644 --- a/py-polars/tests/unit/io/test_lazy_json.py +++ b/py-polars/tests/unit/io/test_lazy_json.py @@ -30,7 +30,7 @@ def test_scan_ndjson(foods_ndjson_path: Path) -> None: df = ( pl.scan_ndjson(foods_ndjson_path, row_count_name="row_count") - .with_row_count("foo", 10) + .with_row_index("foo", 10) .filter(pl.col("category") == pl.lit("vegetables")) .collect() ) diff --git a/py-polars/tests/unit/io/test_lazy_parquet.py b/py-polars/tests/unit/io/test_lazy_parquet.py index ad2cf711b244..490e2389febd 100644 --- a/py-polars/tests/unit/io/test_lazy_parquet.py +++ b/py-polars/tests/unit/io/test_lazy_parquet.py @@ -50,7 +50,7 @@ def test_row_count(foods_parquet_path: Path) -> None: df = ( pl.scan_parquet(foods_parquet_path, row_count_name="row_count") - .with_row_count("foo", 10) + .with_row_index("foo", 10) 
.filter(pl.col("category") == pl.lit("vegetables")) .collect() ) @@ -407,7 +407,7 @@ def test_row_count_empty_file(tmp_path: Path) -> None: file_path = tmp_path / "test.parquet" df = pl.DataFrame({"a": []}, schema={"a": pl.Float32}) df.write_parquet(file_path) - result = pl.scan_parquet(file_path).with_row_count("idx").collect() + result = pl.scan_parquet(file_path).with_row_index("idx").collect() assert result.schema == OrderedDict([("idx", pl.UInt32), ("a", pl.Float32)]) diff --git a/py-polars/tests/unit/operations/rolling/test_rolling.py b/py-polars/tests/unit/operations/rolling/test_rolling.py index b72b9b7622a0..bad3d307c94e 100644 --- a/py-polars/tests/unit/operations/rolling/test_rolling.py +++ b/py-polars/tests/unit/operations/rolling/test_rolling.py @@ -275,11 +275,11 @@ def test_rolling_group_by_extrema() -> None: { "col1": pl.arange(0, 7, eager=True).reverse(), } - ).with_columns(pl.col("col1").reverse().alias("row_nr")) + ).with_columns(pl.col("col1").reverse().alias("index")) assert ( df.rolling( - index_column="row_nr", + index_column="index", period="3i", ) .agg( @@ -314,11 +314,11 @@ def test_rolling_group_by_extrema() -> None: { "col1": pl.arange(0, 7, eager=True), } - ).with_columns(pl.col("col1").alias("row_nr")) + ).with_columns(pl.col("col1").alias("index")) assert ( df.rolling( - index_column="row_nr", + index_column="index", period="3i", ) .agg( @@ -352,11 +352,11 @@ def test_rolling_group_by_extrema() -> None: { "col1": pl.arange(0, 7, eager=True).shuffle(1), } - ).with_columns(pl.col("col1").sort().alias("row_nr")) + ).with_columns(pl.col("col1").sort().alias("index")) assert ( df.rolling( - index_column="row_nr", + index_column="index", period="3i", ) .agg( @@ -629,12 +629,12 @@ def test_rolling_aggregations_with_over_11225() -> None: "date": [start + timedelta(days=k) for k in range(5)], "group": ["A"] * 2 + ["B"] * 3, } - ).with_row_count() + ).with_row_index() df_temporal = df_temporal.sort("group", "date") result = df_temporal.with_columns( - rolling_row_mean=pl.col("row_nr") + rolling_row_mean=pl.col("index") .rolling_mean( window_size="2d", by="date", @@ -645,12 +645,12 @@ def test_rolling_aggregations_with_over_11225() -> None: ) expected = pl.DataFrame( { - "row_nr": [0, 1, 2, 3, 4], + "index": [0, 1, 2, 3, 4], "date": pl.datetime_range(date(2001, 1, 1), date(2001, 1, 5), eager=True), "group": ["A", "A", "B", "B", "B"], "rolling_row_mean": [None, 0.0, None, 2.0, 2.5], }, - schema_overrides={"row_nr": pl.UInt32}, + schema_overrides={"index": pl.UInt32}, ) assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index 086b5a1cf8d2..16521bd2b0c3 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -89,10 +89,10 @@ def test_explode_empty_list_4003() -> None: def test_explode_empty_list_4107() -> None: - df = pl.DataFrame({"b": [[1], [2], []] * 2}).with_row_count() + df = pl.DataFrame({"b": [[1], [2], []] * 2}).with_row_index() assert_frame_equal( - df.explode(["b"]), df.explode(["b"]).drop("row_nr").with_row_count() + df.explode(["b"]), df.explode(["b"]).drop("index").with_row_index() ) @@ -112,15 +112,15 @@ def test_explode_correct_for_slice() -> None: ) ) .sort("group") - .with_row_count() + .with_row_index() ) expected = pl.DataFrame( { - "row_nr": [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 6, 7, 8, 8, 8, 9], + "index": [0, 0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 5, 6, 6, 7, 8, 8, 8, 9], "group": [0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "b": [1, 2, 3, 2, 3, 4, 1, 2, 3, 0, 1, 2, 3, 2, 3, 4, 1, 2, 3, 0], }, - schema_overrides={"row_nr": pl.UInt32}, + schema_overrides={"index": pl.UInt32}, ) assert_frame_equal(df.slice(0, 10).explode(["b"]), expected) @@ -215,12 +215,12 @@ def test_explode_in_agg_context() -> None: ) assert ( - df.with_row_count("row_nr") + df.with_row_index() .explode("idxs") - .group_by("row_nr") + .group_by("index") .agg(pl.col("array").flatten()) ).to_dict(as_series=False) == { - "row_nr": [0, 1, 2], + "index": [0, 1, 2], "array": [[0.0, 3.5], [4.6, 0.0], [0.0, 7.8, 0.0, 0.0, 7.8, 0.0]], } @@ -281,7 +281,7 @@ def test_explode_invalid_element_count() -> None: "col1": [["X", "Y", "Z"], ["F", "G"], ["P"]], "col2": [["A", "B", "C"], ["C"], ["D", "E"]], } - ).with_row_count() + ).with_row_index() with pytest.raises( pl.ShapeError, match=r"exploded columns must have matching element counts" ): diff --git a/py-polars/tests/unit/operations/test_group_by.py b/py-polars/tests/unit/operations/test_group_by.py index c001e0e85457..226aac0855ac 100644 --- a/py-polars/tests/unit/operations/test_group_by.py +++ b/py-polars/tests/unit/operations/test_group_by.py @@ -462,17 +462,17 @@ def test_arg_sort_sort_by_groups_update__4360() -> None: def test_unique_order() -> None: - df = pl.DataFrame({"a": [1, 2, 1]}).with_row_count() + df = pl.DataFrame({"a": [1, 2, 1]}).with_row_index() assert df.unique(keep="last", subset="a", maintain_order=True).to_dict( as_series=False ) == { - "row_nr": [1, 2], + "index": [1, 2], "a": [2, 1], } assert df.unique(keep="first", subset="a", maintain_order=True).to_dict( as_series=False ) == { - "row_nr": [0, 1], + "index": [0, 1], "a": [1, 2], } diff --git a/py-polars/tests/unit/operations/test_join.py b/py-polars/tests/unit/operations/test_join.py index 275ae6034476..6fa7dddfc387 100644 --- a/py-polars/tests/unit/operations/test_join.py +++ b/py-polars/tests/unit/operations/test_join.py @@ -71,12 +71,12 @@ def test_join_same_cat_src() -> None: @pytest.mark.parametrize("reverse", [False, True]) def test_sorted_merge_joins(reverse: bool) -> None: n = 30 - df_a = pl.DataFrame({"a": np.sort(np.random.randint(0, n // 2, n))}).with_row_count( + df_a = pl.DataFrame({"a": np.sort(np.random.randint(0, n // 2, n))}).with_row_index( "row_a" ) df_b = pl.DataFrame( {"a": np.sort(np.random.randint(0, n // 2, n // 2))} - ).with_row_count("row_b") + ).with_row_index("row_b") if reverse: df_a = df_a.select(pl.all().reverse()) @@ -233,20 +233,20 @@ def test_joins_dispatch() -> None: def test_join_on_cast() -> None: df_a = ( pl.DataFrame({"a": [-5, -2, 3, 3, 9, 10]}) - .with_row_count() + .with_row_index() .with_columns(pl.col("a").cast(pl.Int32)) ) df_b = pl.DataFrame({"a": [-2, -3, 3, 10]}) assert df_a.join(df_b, on=pl.col("a").cast(pl.Int64)).to_dict(as_series=False) == { - "row_nr": [1, 2, 3, 5], + "index": [1, 2, 3, 5], "a": [-2, 3, 3, 10], } assert df_a.lazy().join( df_b.lazy(), on=pl.col("a").cast(pl.Int64) ).collect().to_dict(as_series=False) == { - "row_nr": [1, 2, 3, 5], + "index": [1, 2, 3, 5], "a": [-2, 3, 3, 10], } diff --git a/py-polars/tests/unit/operations/test_replace.py b/py-polars/tests/unit/operations/test_replace.py index 03ff873edd4e..fa7375626cc3 100644 --- a/py-polars/tests/unit/operations/test_replace.py +++ b/py-polars/tests/unit/operations/test_replace.py @@ -50,8 +50,8 @@ def test_replace_str_to_str_default_null(str_mapping: dict[str | None, str]) -> def test_replace_str_to_str_default_other(str_mapping: dict[str | None, str]) -> None: 
df = pl.DataFrame({"country_code": ["FR", None, "ES", "DE"]}) - result = df.with_row_count().select( - replaced=pl.col("country_code").replace(str_mapping, default=pl.col("row_nr")) + result = df.with_row_index().select( + replaced=pl.col("country_code").replace(str_mapping, default=pl.col("index")) ) expected = pl.DataFrame({"replaced": ["France", "Not specified", "2", "Germany"]}) assert_frame_equal(result, expected) diff --git a/py-polars/tests/unit/operations/test_rolling.py b/py-polars/tests/unit/operations/test_rolling.py index 138d319239b8..0432da7b194d 100644 --- a/py-polars/tests/unit/operations/test_rolling.py +++ b/py-polars/tests/unit/operations/test_rolling.py @@ -18,12 +18,9 @@ def test_rolling_group_by_overlapping_groups() -> None: assert_series_equal( ( - df.with_row_count() - .with_columns(pl.col("row_nr").cast(pl.Int32)) - .rolling( - index_column="row_nr", - period="5i", - ) + df.with_row_index() + .with_columns(pl.col("index").cast(pl.Int32)) + .rolling(index_column="index", period="5i") .agg( # trigger the apply on the expression engine pl.col("a").map_elements(lambda x: x).sum() diff --git a/py-polars/tests/unit/operations/test_sort.py b/py-polars/tests/unit/operations/test_sort.py index b211e9a29163..78facd848de8 100644 --- a/py-polars/tests/unit/operations/test_sort.py +++ b/py-polars/tests/unit/operations/test_sort.py @@ -119,14 +119,10 @@ def test_sort_nans_3740() -> None: def test_sort_by_exps_nulls_last() -> None: - df = pl.DataFrame( - { - "a": [1, 3, -2, None, 1], - } - ).with_row_count() + df = pl.DataFrame({"a": [1, 3, -2, None, 1]}).with_row_index() assert df.sort(pl.col("a") ** 2, nulls_last=True).to_dict(as_series=False) == { - "row_nr": [0, 4, 2, 1, 3], + "index": [0, 4, 2, 1, 3], "a": [1, 1, -2, 3, None], } @@ -183,7 +179,7 @@ def test_sorted_join_and_dtypes() -> None: for dt in [pl.Int8, pl.Int16, pl.Int32, pl.Int16]: df_a = ( pl.DataFrame({"a": [-5, -2, 3, 3, 9, 10]}) - .with_row_count() + .with_row_index() .with_columns(pl.col("a").cast(dt).set_sorted()) ) @@ -192,11 +188,11 @@ def test_sorted_join_and_dtypes() -> None: ) assert df_a.join(df_b, on="a", how="inner").to_dict(as_series=False) == { - "row_nr": [1, 2, 3, 5], + "index": [1, 2, 3, 5], "a": [-2, 3, 3, 10], } assert df_a.join(df_b, on="a", how="left").to_dict(as_series=False) == { - "row_nr": [0, 1, 2, 3, 4, 5], + "index": [0, 1, 2, 3, 4, 5], "a": [-5, -2, 3, 3, 9, 10], } @@ -399,7 +395,7 @@ def test_sorted_join_query_5406() -> None: } ) .with_columns(pl.col("Datetime").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S")) - .with_row_count("RowId") + .with_row_index("RowId") ) df1 = df.sort(by=["Datetime", "RowId"]) @@ -441,7 +437,7 @@ def test_merge_sorted() -> None: datetime(2022, 1, 1), datetime(2022, 12, 1), "1mo", eager=True ) .to_frame("range") - .with_row_count() + .with_row_index() ) df_b = ( @@ -449,13 +445,13 @@ def test_merge_sorted() -> None: datetime(2022, 1, 1), datetime(2022, 12, 1), "2mo", eager=True ) .to_frame("range") - .with_row_count() - .with_columns(pl.col("row_nr") * 10) + .with_row_index() + .with_columns(pl.col("index") * 10) ) out = df_a.merge_sorted(df_b, key="range") assert out["range"].is_sorted() assert out.to_dict(as_series=False) == { - "row_nr": [0, 0, 1, 2, 10, 3, 4, 20, 5, 6, 30, 7, 8, 40, 9, 10, 50, 11], + "index": [0, 0, 1, 2, 10, 3, 4, 20, 5, 6, 30, 7, 8, 40, 9, 10, 50, 11], "range": [ datetime(2022, 1, 1, 0, 0), datetime(2022, 1, 1, 0, 0), @@ -577,9 +573,9 @@ def test_limit_larger_than_sort() -> None: def test_sort_by_struct() -> None: - df = 
pl.Series([{"a": 300}, {"a": 20}, {"a": 55}]).to_frame("st").with_row_count() + df = pl.Series([{"a": 300}, {"a": 20}, {"a": 55}]).to_frame("st").with_row_index() assert df.sort("st").to_dict(as_series=False) == { - "row_nr": [1, 2, 0], + "index": [1, 2, 0], "st": [{"a": 20}, {"a": 55}, {"a": 300}], } diff --git a/py-polars/tests/unit/streaming/test_streaming_group_by.py b/py-polars/tests/unit/streaming/test_streaming_group_by.py index d3b76294f986..e4ad5117b5b9 100644 --- a/py-polars/tests/unit/streaming/test_streaming_group_by.py +++ b/py-polars/tests/unit/streaming/test_streaming_group_by.py @@ -169,7 +169,7 @@ def test_streaming_group_by_sorted_fast_path() -> None: # test on int8 as that also tests proper conversions "a": pl.Series(np.sort(a), dtype=pl.Int8) } - ).with_row_count() + ).with_row_index() df_sorted = df.with_columns(pl.col("a").set_sorted()) diff --git a/py-polars/tests/unit/test_cse.py b/py-polars/tests/unit/test_cse.py index 749d72b8f4d8..452c11460039 100644 --- a/py-polars/tests/unit/test_cse.py +++ b/py-polars/tests/unit/test_cse.py @@ -147,7 +147,7 @@ def test_schema_row_count_cse() -> None: ) csv_a.seek(0) - df_a = pl.scan_csv(csv_a.name).with_row_count("Idx") + df_a = pl.scan_csv(csv_a.name).with_row_index("Idx") result = ( df_a.join(df_a, on="B") diff --git a/py-polars/tests/unit/test_projections.py b/py-polars/tests/unit/test_projections.py index dc2f7908f67e..8e974f18ec18 100644 --- a/py-polars/tests/unit/test_projections.py +++ b/py-polars/tests/unit/test_projections.py @@ -23,13 +23,13 @@ def test_projection_on_semi_join_4789() -> None: def test_melt_projection_pd_block_4997() -> None: assert ( pl.DataFrame({"col1": ["a"], "col2": ["b"]}) - .with_row_count() + .with_row_index() .lazy() - .melt(id_vars="row_nr") - .group_by("row_nr") + .melt(id_vars="index") + .group_by("index") .agg(pl.col("variable").alias("result")) .collect() - ).to_dict(as_series=False) == {"row_nr": [0], "result": [["col1", "col2"]]} + ).to_dict(as_series=False) == {"index": [0], "result": [["col1", "col2"]]} def test_double_projection_pushdown() -> None: diff --git a/py-polars/tests/unit/test_selectors.py b/py-polars/tests/unit/test_selectors.py index 5d8d670c5611..417a16b6b66e 100644 --- a/py-polars/tests/unit/test_selectors.py +++ b/py-polars/tests/unit/test_selectors.py @@ -524,11 +524,11 @@ def test_selector_or() -> None: "float": [1.0, 2.0, 3.0], "str": ["x", "y", "z"], } - ).with_row_count("rn") + ).with_row_index("idx") - result = df.select(cs.by_name("rn") | ~cs.numeric()) + result = df.select(cs.by_name("idx") | ~cs.numeric()) expected = pl.DataFrame( - {"rn": [0, 1, 2], "str": ["x", "y", "z"]}, schema_overrides={"rn": pl.UInt32} + {"idx": [0, 1, 2], "str": ["x", "y", "z"]}, schema_overrides={"idx": pl.UInt32} ) assert_frame_equal(result, expected)