From a3de5d065120a00b950a8867f60778a6a4798163 Mon Sep 17 00:00:00 2001
From: Alexander Beedie
Date: Wed, 6 Nov 2024 21:22:45 +0400
Subject: [PATCH] feat: Support `cast` from Duration to String

---
 .../src/chunked_array/logical/duration.rs    |   7 ++
 crates/polars-core/src/fmt.rs                | 106 +++++++++---------
 .../tests/unit/datatypes/test_duration.py    |  54 +++++++++
 py-polars/tests/unit/interop/test_interop.py | 102 ++++++++---------
 4 files changed, 163 insertions(+), 106 deletions(-)

diff --git a/crates/polars-core/src/chunked_array/logical/duration.rs b/crates/polars-core/src/chunked_array/logical/duration.rs
index ca0347d87b5a4..e1489a6401522 100644
--- a/crates/polars-core/src/chunked_array/logical/duration.rs
+++ b/crates/polars-core/src/chunked_array/logical/duration.rs
@@ -1,4 +1,5 @@
 use super::*;
+use crate::fmt::fmt_duration_string;
 use crate::prelude::*;
 
 pub type DurationChunked = Logical;
@@ -54,6 +55,12 @@ impl LogicalType for DurationChunked {
                 };
                 Ok(out.into_duration(to_unit).into_series())
             },
+            String => {
+                let out: StringChunked = self.0.apply_nonnull_values_generic(String, |v: i64| {
+                    fmt_duration_string(v, self.time_unit())
+                });
+                Ok(out.into())
+            },
             dt if dt.is_numeric() => self.0.cast_with_options(dtype, cast_options),
             dt => {
                 polars_bail!(
diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs
index 88fbfae967015..f60fadb9af785 100644
--- a/crates/polars-core/src/fmt.rs
+++ b/crates/polars-core/src/fmt.rs
@@ -980,63 +980,63 @@ const SIZES_US: [i64; 4] = [86_400_000_000, 3_600_000_000, 60_000_000, 1_000_000
 const SIZES_MS: [i64; 4] = [86_400_000, 3_600_000, 60_000, 1_000];
 
 #[cfg(feature = "dtype-duration")]
-fn fmt_duration_ns(f: &mut Formatter<'_>, v: i64) -> fmt::Result {
+/// Render a physical duration value `v` (a count of `unit`s) as a readable
+/// string such as "3d 22m 55s 1000µs".
+///
+/// Whole days, hours, minutes and seconds are emitted in that order, each
+/// followed by a space only while a non-zero remainder is still pending.
+/// Zero-valued components are skipped; a zero duration renders as "0"
+/// plus the unit suffix (e.g. "0µs").
+pub fn fmt_duration_string(v: i64, unit: TimeUnit) -> String {
     if v == 0 {
-        return write!(f, "0ns");
-    }
-    format_duration(f, v, SIZES_NS.as_slice(), NAMES.as_slice())?;
-    if v % 1000 != 0 {
-        write!(f, "{}ns", v % 1_000_000_000)?;
-    } else if v % 1_000_000 != 0 {
-        write!(f, "{}µs", (v % 1_000_000_000) / 1000)?;
-    } else if v % 1_000_000_000 != 0 {
-        write!(f, "{}ms", (v % 1_000_000_000) / 1_000_000)?;
-    }
-    Ok(())
-}
-
-#[cfg(feature = "dtype-duration")]
-fn fmt_duration_us(f: &mut Formatter<'_>, v: i64) -> fmt::Result {
-    if v == 0 {
-        return write!(f, "0µs");
-    }
-    format_duration(f, v, SIZES_US.as_slice(), NAMES.as_slice())?;
-    if v % 1000 != 0 {
-        write!(f, "{}µs", (v % 1_000_000))?;
-    } else if v % 1_000_000 != 0 {
-        write!(f, "{}ms", (v % 1_000_000) / 1_000)?;
-    }
-    Ok(())
-}
-
-#[cfg(feature = "dtype-duration")]
-fn fmt_duration_ms(f: &mut Formatter<'_>, v: i64) -> fmt::Result {
-    if v == 0 {
-        return write!(f, "0ms");
-    }
-    format_duration(f, v, SIZES_MS.as_slice(), NAMES.as_slice())?;
-    if v % 1_000 != 0 {
-        write!(f, "{}ms", (v % 1_000))?;
+        return match unit {
+            TimeUnit::Nanoseconds => "0ns".to_string(),
+            TimeUnit::Microseconds => "0µs".to_string(),
+            TimeUnit::Milliseconds => "0ms".to_string(),
+        };
     }
-    Ok(())
-}
-
-#[cfg(feature = "dtype-duration")]
-fn format_duration(f: &mut Formatter, v: i64, sizes: &[i64], names: &[&str]) -> fmt::Result {
-    for i in 0..4 {
+    let sizes = match unit {
+        TimeUnit::Nanoseconds => SIZES_NS.as_slice(),
+        TimeUnit::Microseconds => SIZES_US.as_slice(),
+        TimeUnit::Milliseconds => SIZES_MS.as_slice(),
+    };
+    let mut s = String::with_capacity(32);
+    for (i, &size) in sizes.iter().enumerate() {
         let whole_num = if i == 0 {
-            v / sizes[i]
+            v / size
         } else {
-            (v % sizes[i - 1]) / sizes[i]
+            (v % sizes[i - 1]) / size
         };
-        if whole_num <= -1 || whole_num >= 1 {
-            write!(f, "{}{}", whole_num, names[i])?;
-            if v % sizes[i] != 0 {
-                write!(f, " ")?;
+        if whole_num != 0 {
+            s.push_str(&format!("{}{}", whole_num, NAMES[i]));
+            if v % size != 0 {
+                s.push(' ');
             }
         }
     }
-    Ok(())
+    // Append the sub-second remainder in the value's own unit. A zero
+    // remainder adds nothing, so whole-second values never gain a "0ms".
+    match unit {
+        TimeUnit::Nanoseconds => {
+            let ns = v % 1_000_000_000;
+            if ns != 0 {
+                s.push_str(&format!("{}ns", ns));
+            }
+        },
+        TimeUnit::Microseconds => {
+            let us = v % 1_000_000;
+            if us != 0 {
+                s.push_str(&format!("{}µs", us));
+            }
+        },
+        TimeUnit::Milliseconds => {
+            let ms = v % 1_000;
+            if ms != 0 {
+                s.push_str(&format!("{}ms", ms));
+            }
+        },
+    }
+    s
 }
 
 fn format_blob(f: &mut Formatter<'_>, bytes: &[u8]) -> fmt::Result {
@@ -1087,11 +1087,7 @@ impl Display for AnyValue<'_> {
                 fmt_datetime(f, *v, *tu, tz.as_ref().map(|v| v.as_ref()))
             },
             #[cfg(feature = "dtype-duration")]
-            AnyValue::Duration(v, tu) => match tu {
-                TimeUnit::Nanoseconds => fmt_duration_ns(f, *v),
-                TimeUnit::Microseconds => fmt_duration_us(f, *v),
-                TimeUnit::Milliseconds => fmt_duration_ms(f, *v),
-            },
+            AnyValue::Duration(v, tu) => write!(f, "{}", fmt_duration_string(*v, *tu)),
             #[cfg(feature = "dtype-time")]
             AnyValue::Time(_) => {
                 let nt: chrono::NaiveTime = self.into();
@@ -1221,7 +1217,7 @@ impl Series {
 
 #[inline]
 #[cfg(feature = "dtype-decimal")]
-pub fn fmt_decimal(f: &mut Formatter<'_>, v: i128, scale: usize) -> fmt::Result {
+fn fmt_decimal(f: &mut Formatter<'_>, v: i128, scale: usize) -> fmt::Result {
     use arrow::compute::decimal::format_decimal;
 
     let trim_zeros = get_trim_decimal_zeros();
diff --git a/py-polars/tests/unit/datatypes/test_duration.py b/py-polars/tests/unit/datatypes/test_duration.py
index 597ac1c4a624e..b499b9d36251f 100644
--- a/py-polars/tests/unit/datatypes/test_duration.py
+++ b/py-polars/tests/unit/datatypes/test_duration.py
@@ -22,6 +22,60 @@ def test_duration_cum_sum() -> None:
     assert df.schema["A"].is_(duration_dtype) is False
 
 
+def test_duration_cast() -> None:
+    durations = [
+        timedelta(days=180, seconds=56789, microseconds=987654),
+        timedelta(days=0, seconds=64875, microseconds=8884),
+        timedelta(days=2, hours=23, seconds=4975, milliseconds=1),
+        timedelta(hours=1, seconds=1, milliseconds=1, microseconds=1),
+        timedelta(seconds=-42, milliseconds=-42),
+        None,
] + + df = pl.DataFrame({"td": durations}, schema={"td": pl.Duration("us")}) + df_cast = df.select( + td_ms=pl.col("td").cast(pl.Duration("ms")), + td_str=pl.col("td").cast(pl.String), + td_int=pl.col("td").cast(pl.Int64), + ) + assert df_cast.schema == { + "td_ms": pl.Duration(time_unit="ms"), + "td_str": pl.Utf8, + "td_int": pl.Int64, + } + + expected = pl.DataFrame( + { + "td_ms": [ + timedelta(days=180, seconds=56789, milliseconds=987), + timedelta(days=0, seconds=64875, milliseconds=8), + timedelta(days=2, hours=23, seconds=4975, milliseconds=1), + timedelta(hours=1, seconds=1, milliseconds=1), + timedelta(seconds=-42, milliseconds=-42), + None, + ], + "td_str": [ + "180d 15h 46m 29s 987654µs", + "18h 1m 15s 8884µs", + "3d 22m 55s 1000µs", + "1h 1s 1001µs", + "-42s -42000µs", + None, + ], + "td_int": [ + 15608789987654, + 64875008884, + 260575001000, + 3601001001, + -42042000, + None, + ], + }, + schema_overrides={"td_ms": pl.Duration(time_unit="ms")}, + ) + assert_frame_equal(expected, df_cast) + + def test_duration_std_var() -> None: df = pl.DataFrame( {"duration": [1000, 5000, 3000]}, schema={"duration": pl.Duration} diff --git a/py-polars/tests/unit/interop/test_interop.py b/py-polars/tests/unit/interop/test_interop.py index b69a10671ca71..7ab6c196c8072 100644 --- a/py-polars/tests/unit/interop/test_interop.py +++ b/py-polars/tests/unit/interop/test_interop.py @@ -412,13 +412,13 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - ┌─────┬─────┬─────┬─────┬─────┬───────┐ - │ id ┆ q1 ┆ q2 ┆ q3 ┆ q4 ┆ total │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i8 ┆ i16 ┆ i32 ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╪═════╪═════╪═══════╡ - └─────┴─────┴─────┴─────┴─────┴───────┘ - """ + ┌─────┬─────┬─────┬─────┬─────┬───────┐ + │ id ┆ q1 ┆ q2 ┆ q3 ┆ q4 ┆ total │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i8 ┆ i16 ┆ i32 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╪═════╪═════╪═══════╡ + └─────┴─────┴─────┴─────┴─────┴───────┘ + """ ), ) assert 
df.shape == (0, 6) @@ -437,11 +437,11 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - ┌──────┬───────┐ - │ misc ┆ other │ - ╞══════╪═══════╡ - └──────┴───────┘ - """ + ┌──────┬───────┐ + │ misc ┆ other │ + ╞══════╪═══════╡ + └──────┴───────┘ + """ ), ) assert_frame_equal(df, pl.DataFrame(schema={"misc": pl.String, "other": pl.String})) @@ -472,17 +472,17 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - # >>> Missing cols with old-style ellipsis, nulls, commented out - # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐ - # │ dt ┆ c1 ┆ c2 ┆ c3 ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99 │ - # │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ - # │ date ┆ i32 ┆ i32 ┆ i32 ┆ ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡ - # │ 2023-03-25 ┆ 1 ┆ 2 ┆ 3 ┆ ... ┆ 96 ┆ 97 ┆ 98 ┆ 99 │ - # │ 1999-12-31 ┆ 3 ┆ 6 ┆ 9 ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │ - # │ null ┆ 9 ┆ 18 ┆ 27 ┆ ... ┆ 864 ┆ 873 ┆ 882 ┆ 891 │ - # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘ - """ + # >>> Missing cols with old-style ellipsis, nulls, commented out + # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐ + # │ dt ┆ c1 ┆ c2 ┆ c3 ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99 │ + # │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ + # │ date ┆ i32 ┆ i32 ┆ i32 ┆ ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡ + # │ 2023-03-25 ┆ 1 ┆ 2 ┆ 3 ┆ ... ┆ 96 ┆ 97 ┆ 98 ┆ 99 │ + # │ 1999-12-31 ┆ 3 ┆ 6 ┆ 9 ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │ + # │ null ┆ 9 ┆ 18 ┆ 27 ┆ ... 
┆ 864 ┆ 873 ┆ 882 ┆ 891 │ + # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘ + """ ), ) assert df.schema == { @@ -505,15 +505,15 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - # >>> no dtypes: - # ┌────────────┬──────┐ - # │ dt ┆ c99 │ - # ╞════════════╪══════╡ - # │ 2023-03-25 ┆ 99 │ - # │ 1999-12-31 ┆ null │ - # │ null ┆ 891 │ - # └────────────┴──────┘ - """ + # >>> no dtypes: + # ┌────────────┬──────┐ + # │ dt ┆ c99 │ + # ╞════════════╪══════╡ + # │ 2023-03-25 ┆ 99 │ + # │ 1999-12-31 ┆ null │ + # │ null ┆ 891 │ + # └────────────┴──────┘ + """ ), ) assert df.schema == {"dt": pl.Date, "c99": pl.Int64} @@ -527,25 +527,25 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - In [2]: with pl.Config() as cfg: - ...: pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True) - ...: print(df) - ...: - shape: (1, 5) - ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮ - │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp │ - │ tor_id ┆ nnel_id ┆ ┆ --- ┆ --- │ - │ --- ┆ --- ┆ ┆ str ┆ datetime[μs, Asia/Tokyo] │ - │ i32 ┆ i64 ┆ ┆ ┆ │ - ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡ - │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │ - ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ … ┆ … ┆ … ┆ … ┆ … │ - ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │ - ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯ - # "Een fluitje van een cent..." 
:) - """ + In [2]: with pl.Config() as cfg: + ...: pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True) + ...: print(df) + ...: + shape: (1, 5) + ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮ + │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp │ + │ tor_id ┆ nnel_id ┆ ┆ --- ┆ --- │ + │ --- ┆ --- ┆ ┆ str ┆ datetime[μs, Asia/Tokyo] │ + │ i32 ┆ i64 ┆ ┆ ┆ │ + ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡ + │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │ + ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ … ┆ … ┆ … ┆ … ┆ … │ + ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │ + ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯ + # "Een fluitje van een cent..." :) + """ ), ) assert df.shape == (2, 4)