From 7f34cf66b645feaa1c25787f63b23d049cb09be1 Mon Sep 17 00:00:00 2001 From: Alexander Beedie Date: Wed, 6 Nov 2024 21:22:45 +0400 Subject: [PATCH] feat: Support use of Duration dtype in `to_string` --- Cargo.lock | 1 + crates/polars-core/Cargo.toml | 1 + .../src/chunked_array/temporal/conversion.rs | 38 ++++- .../src/chunked_array/temporal/date.rs | 1 + .../src/chunked_array/temporal/datetime.rs | 6 +- .../src/chunked_array/temporal/duration.rs | 19 +++ .../src/chunked_array/temporal/time.rs | 1 + crates/polars-core/src/fmt.rs | 149 +++++++++++------- crates/polars-core/src/frame/mod.rs | 2 +- .../src/series/implementations/datetime.rs | 12 +- crates/polars-time/src/series/mod.rs | 26 ++- py-polars/polars/dataframe/frame.py | 2 +- py-polars/polars/expr/datetime.py | 144 ++++++++++++----- py-polars/polars/series/datetime.py | 44 ++++-- .../tests/unit/datatypes/test_duration.py | 64 ++++++++ .../tests/unit/datatypes/test_temporal.py | 64 ++++++++ py-polars/tests/unit/interop/test_interop.py | 102 ++++++------ 17 files changed, 494 insertions(+), 182 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5176bd831139f..0c092a91b3656 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2828,6 +2828,7 @@ dependencies = [ "hashbrown 0.14.5", "hashbrown 0.15.0", "indexmap", + "itoa", "ndarray", "num-traits", "once_cell", diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index bb5cdc85cdac4..05723922e17e4 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -26,6 +26,7 @@ either = { workspace = true } hashbrown = { workspace = true } hashbrown_old_nightly_hack = { workspace = true } indexmap = { workspace = true } +itoa = { workspace = true } ndarray = { workspace = true, optional = true } num-traits = { workspace = true } once_cell = { workspace = true } diff --git a/crates/polars-core/src/chunked_array/temporal/conversion.rs b/crates/polars-core/src/chunked_array/temporal/conversion.rs index f54c17d4081e9..7fa4640a4469b 100644 --- a/crates/polars-core/src/chunked_array/temporal/conversion.rs +++ b/crates/polars-core/src/chunked_array/temporal/conversion.rs @@ -3,7 +3,9 @@ use chrono::*; use crate::prelude::*; -/// Number of seconds in a day +pub(crate) const NS_IN_DAY: i64 = 86_400_000_000_000; +pub(crate) const US_IN_DAY: i64 = 86_400_000_000; +pub(crate) const MS_IN_DAY: i64 = 86_400_000; pub(crate) const SECONDS_IN_DAY: i64 = 86_400; impl From<&AnyValue<'_>> for NaiveDateTime { @@ -37,12 +39,10 @@ pub fn datetime_to_timestamp_ns(v: NaiveDateTime) -> i64 { v.and_utc().timestamp_nanos_opt().unwrap() } -// Used by lazy for literal conversion pub fn datetime_to_timestamp_ms(v: NaiveDateTime) -> i64 { v.and_utc().timestamp_millis() } -// Used by lazy for literal conversion pub fn datetime_to_timestamp_us(v: NaiveDateTime) -> i64 { let us = v.and_utc().timestamp() * 1_000_000; us + v.and_utc().timestamp_subsec_micros() as i64 @@ -52,6 +52,32 @@ pub(crate) fn naive_datetime_to_date(v: NaiveDateTime) -> i32 { (datetime_to_timestamp_ms(v) / (MILLISECONDS * SECONDS_IN_DAY)) as i32 } -pub(crate) const NS_IN_DAY: i64 = 86_400_000_000_000; -pub(crate) const US_IN_DAY: i64 = 86_400_000_000; -pub(crate) const MS_IN_DAY: i64 = 86_400_000; +pub fn get_strftime_format(fmt: &str, dtype: &DataType) -> String { + if fmt != "iso" { + return fmt.to_string(); + } + #[allow(unreachable_code)] + let fmt: &str = match dtype { + #[cfg(feature = "dtype-datetime")] + DataType::Datetime(tu, tz) => match (tu, tz.is_some()) { + (TimeUnit::Milliseconds, true) => "%F %T%.3f%:z", + (TimeUnit::Milliseconds, false) => "%F %T%.3f", + (TimeUnit::Microseconds, true) => "%F %T%.6f%:z", + (TimeUnit::Microseconds, false) => "%F %T%.6f", + (TimeUnit::Nanoseconds, true) => "%F %T%.9f%:z", + (TimeUnit::Nanoseconds, false) => "%F %T%.9f", + }, + #[cfg(feature = "dtype-date")] + DataType::Date => "%F", + #[cfg(feature = "dtype-time")] + DataType::Time => "%T%.f", + _ => { + let err = format!( + "invalid call to `get_strftime_format`; fmt={:?}, dtype={}", + fmt, dtype + ); + unimplemented!("{}", err) + }, + }; + fmt.to_string() +} diff --git a/crates/polars-core/src/chunked_array/temporal/date.rs b/crates/polars-core/src/chunked_array/temporal/date.rs index ea0bb11d10fc4..8ec371e92b37c 100644 --- a/crates/polars-core/src/chunked_array/temporal/date.rs +++ b/crates/polars-core/src/chunked_array/temporal/date.rs @@ -33,6 +33,7 @@ impl DateChunked { /// Convert from Date into String with the given format. /// See [chrono strftime/strptime](https://docs.rs/chrono/0.4.19/chrono/format/strftime/index.html). pub fn to_string(&self, format: &str) -> PolarsResult { + let format = if format == "iso" { "%F" } else { format }; let datefmt_f = |ndt: NaiveDate| ndt.format(format); self.try_apply_into_string_amortized(|val, buf| { let ndt = date32_to_date(val); diff --git a/crates/polars-core/src/chunked_array/temporal/datetime.rs b/crates/polars-core/src/chunked_array/temporal/datetime.rs index 92439e5b75279..3f8c6390696d2 100644 --- a/crates/polars-core/src/chunked_array/temporal/datetime.rs +++ b/crates/polars-core/src/chunked_array/temporal/datetime.rs @@ -47,12 +47,12 @@ impl DatetimeChunked { TimeUnit::Microseconds => timestamp_us_to_datetime, TimeUnit::Milliseconds => timestamp_ms_to_datetime, }; - + let format = get_strftime_format(format, self.dtype()); let mut ca: StringChunked = match self.time_zone() { #[cfg(feature = "timezones")] Some(time_zone) => { let parsed_time_zone = time_zone.parse::().expect("already validated"); - let datefmt_f = |ndt| parsed_time_zone.from_utc_datetime(&ndt).format(format); + let datefmt_f = |ndt| parsed_time_zone.from_utc_datetime(&ndt).format(&format); self.try_apply_into_string_amortized(|val, buf| { let ndt = conversion_f(val); write!(buf, "{}", datefmt_f(ndt)) @@ -62,7 +62,7 @@ impl DatetimeChunked { )? }, _ => { - let datefmt_f = |ndt: NaiveDateTime| ndt.format(format); + let datefmt_f = |ndt: NaiveDateTime| ndt.format(&format); self.try_apply_into_string_amortized(|val, buf| { let ndt = conversion_f(val); write!(buf, "{}", datefmt_f(ndt)) diff --git a/crates/polars-core/src/chunked_array/temporal/duration.rs b/crates/polars-core/src/chunked_array/temporal/duration.rs index df8a51388bafc..d17eb9a9df1f0 100644 --- a/crates/polars-core/src/chunked_array/temporal/duration.rs +++ b/crates/polars-core/src/chunked_array/temporal/duration.rs @@ -1,4 +1,5 @@ use crate::export::chrono::Duration as ChronoDuration; +use crate::fmt::fmt_duration_string; use crate::prelude::DataType::Duration; use crate::prelude::*; @@ -60,6 +61,24 @@ impl DurationChunked { self.2 = Some(Duration(tu)) } + /// Convert from [`Duration`] to String; note that `strftime` format + /// strings are not supported, only the specifiers 'iso' and 'polars'. + pub fn to_string(&self, format: &str) -> PolarsResult { + match format { + "iso" | "polars" => { + let out: StringChunked = self + .0 + .apply_nonnull_values_generic(DataType::String, |v: i64| { + fmt_duration_string(v, self.time_unit(), format == "iso") + }); + Ok(out) + }, + _ => Err(PolarsError::InvalidOperation( + format!("format {:?} not supported for Duration type (expected one of 'iso' or 'polars')", format).into(), + )), + } + } + /// Construct a new [`DurationChunked`] from an iterator over [`ChronoDuration`]. pub fn from_duration>( name: PlSmallStr, diff --git a/crates/polars-core/src/chunked_array/temporal/time.rs b/crates/polars-core/src/chunked_array/temporal/time.rs index 77e204c765de7..7cc8a767e54ee 100644 --- a/crates/polars-core/src/chunked_array/temporal/time.rs +++ b/crates/polars-core/src/chunked_array/temporal/time.rs @@ -23,6 +23,7 @@ impl TimeChunked { pub fn to_string(&self, format: &str) -> StringChunked { let mut ca: StringChunked = self.apply_kernel_cast(&|arr| { let mut buf = String::new(); + let format = if format == "iso" { "%T%.9f" } else { format }; let mut mutarr = MutablePlString::with_capacity(arr.len()); for opt in arr.into_iter() { diff --git a/crates/polars-core/src/fmt.rs b/crates/polars-core/src/fmt.rs index 88fbfae967015..95c07e9191259 100644 --- a/crates/polars-core/src/fmt.rs +++ b/crates/polars-core/src/fmt.rs @@ -22,6 +22,8 @@ use comfy_table::modifiers::*; use comfy_table::presets::*; #[cfg(any(feature = "fmt", feature = "fmt_no_tty"))] use comfy_table::*; +#[cfg(feature = "dtype-duration")] +use itoa; use num_traits::{Num, NumCast}; use crate::config::*; @@ -966,7 +968,9 @@ fn fmt_datetime( } #[cfg(feature = "dtype-duration")] -const NAMES: [&str; 4] = ["d", "h", "m", "s"]; +const DURATION_PARTS: [&str; 4] = ["d", "h", "m", "s"]; +#[cfg(feature = "dtype-duration")] +const ISO_DURATION_PARTS: [&str; 4] = ["D", "H", "M", "S"]; #[cfg(feature = "dtype-duration")] const SIZES_NS: [i64; 4] = [ 86_400_000_000_000, @@ -980,63 +984,102 @@ const SIZES_US: [i64; 4] = [86_400_000_000, 3_600_000_000, 60_000_000, 1_000_000 const SIZES_MS: [i64; 4] = [86_400_000, 3_600_000, 60_000, 1_000]; #[cfg(feature = "dtype-duration")] -fn fmt_duration_ns(f: &mut Formatter<'_>, v: i64) -> fmt::Result { +pub fn fmt_duration_string(mut v: i64, unit: TimeUnit, iso: bool) -> String { if v == 0 { - return write!(f, "0ns"); - } - format_duration(f, v, SIZES_NS.as_slice(), NAMES.as_slice())?; - if v % 1000 != 0 { - write!(f, "{}ns", v % 1_000_000_000)?; - } else if v % 1_000_000 != 0 { - write!(f, "{}µs", (v % 1_000_000_000) / 1000)?; - } else if v % 1_000_000_000 != 0 { - write!(f, "{}ms", (v % 1_000_000_000) / 1_000_000)?; - } - Ok(()) -} - -#[cfg(feature = "dtype-duration")] -fn fmt_duration_us(f: &mut Formatter<'_>, v: i64) -> fmt::Result { - if v == 0 { - return write!(f, "0µs"); - } - format_duration(f, v, SIZES_US.as_slice(), NAMES.as_slice())?; - if v % 1000 != 0 { - write!(f, "{}µs", (v % 1_000_000))?; - } else if v % 1_000_000 != 0 { - write!(f, "{}ms", (v % 1_000_000) / 1_000)?; - } - Ok(()) -} + return if iso { + "PT0S".to_string() + } else { + match unit { + TimeUnit::Nanoseconds => "0ns".to_string(), + TimeUnit::Microseconds => "0µs".to_string(), + TimeUnit::Milliseconds => "0ms".to_string(), + } + }; + }; + let sizes = match unit { + TimeUnit::Nanoseconds => SIZES_NS.as_slice(), + TimeUnit::Microseconds => SIZES_US.as_slice(), + TimeUnit::Milliseconds => SIZES_MS.as_slice(), + }; -#[cfg(feature = "dtype-duration")] -fn fmt_duration_ms(f: &mut Formatter<'_>, v: i64) -> fmt::Result { - if v == 0 { - return write!(f, "0ms"); - } - format_duration(f, v, SIZES_MS.as_slice(), NAMES.as_slice())?; - if v % 1_000 != 0 { - write!(f, "{}ms", (v % 1_000))?; - } - Ok(()) -} + let mut s = String::with_capacity(32); + let mut buffer = itoa::Buffer::new(); + if iso { + if v < 0 { + s.push_str("-P"); + v = v.abs() + } else { + s.push('P'); + } + }; -#[cfg(feature = "dtype-duration")] -fn format_duration(f: &mut Formatter, v: i64, sizes: &[i64], names: &[&str]) -> fmt::Result { - for i in 0..4 { + for (i, &size) in sizes.iter().enumerate() { let whole_num = if i == 0 { - v / sizes[i] + v / size } else { - (v % sizes[i - 1]) / sizes[i] + (v % sizes[i - 1]) / size }; - if whole_num <= -1 || whole_num >= 1 { - write!(f, "{}{}", whole_num, names[i])?; - if v % sizes[i] != 0 { - write!(f, " ")?; + if whole_num != 0 || (iso && i == 3) { + s.push_str(buffer.format(whole_num)); + if iso { + if i == 3 { + let secs = match unit { + TimeUnit::Nanoseconds => format!(".{:09}", v % size), + TimeUnit::Microseconds => format!(".{:06}", v % size), + TimeUnit::Milliseconds => format!(".{:03}", v % size), + }; + s.push_str(secs.trim_end_matches('0')); + } + s.push_str(ISO_DURATION_PARTS[i]); + if i == 0 { + s.push('T'); + } + } else { + s.push_str(DURATION_PARTS[i]); + if v % size != 0 { + s.push(' '); + } } + } else if iso && i == 0 { + s.push('T'); } } - Ok(()) + if iso { + if s.ends_with('T') { + s.pop(); + } + } else { + match unit { + TimeUnit::Nanoseconds => { + if v % 1000 != 0 { + s.push_str(buffer.format(v % 1_000_000_000)); + s.push_str("ns"); + } else if v % 1_000_000 != 0 { + s.push_str(buffer.format((v % 1_000_000_000) / 1000)); + s.push_str("µs"); + } else if v % 1_000_000_000 != 0 { + s.push_str(buffer.format((v % 1_000_000_000) / 1_000_000)); + s.push_str("ms"); + } + }, + TimeUnit::Microseconds => { + if v % 1000 != 0 { + s.push_str(buffer.format(v % 1_000_000)); + s.push_str("µs"); + } else if v % 1_000_000 != 0 { + s.push_str(buffer.format((v % 1_000_000) / 1_000)); + s.push_str("ms"); + } + }, + TimeUnit::Milliseconds => { + if v % 1000 != 0 { + s.push_str(buffer.format(v % 1_000)); + s.push_str("ms"); + } + }, + } + } + s } fn format_blob(f: &mut Formatter<'_>, bytes: &[u8]) -> fmt::Result { @@ -1087,11 +1130,7 @@ impl Display for AnyValue<'_> { fmt_datetime(f, *v, *tu, tz.as_ref().map(|v| v.as_ref())) }, #[cfg(feature = "dtype-duration")] - AnyValue::Duration(v, tu) => match tu { - TimeUnit::Nanoseconds => fmt_duration_ns(f, *v), - TimeUnit::Microseconds => fmt_duration_us(f, *v), - TimeUnit::Milliseconds => fmt_duration_ms(f, *v), - }, + AnyValue::Duration(v, tu) => write!(f, "{}", fmt_duration_string(*v, *tu, false)), #[cfg(feature = "dtype-time")] AnyValue::Time(_) => { let nt: chrono::NaiveTime = self.into(); @@ -1221,7 +1260,7 @@ impl Series { #[inline] #[cfg(feature = "dtype-decimal")] -pub fn fmt_decimal(f: &mut Formatter<'_>, v: i128, scale: usize) -> fmt::Result { +fn fmt_decimal(f: &mut Formatter<'_>, v: i128, scale: usize) -> fmt::Result { use arrow::compute::decimal::format_decimal; let trim_zeros = get_trim_decimal_zeros(); diff --git a/crates/polars-core/src/frame/mod.rs b/crates/polars-core/src/frame/mod.rs index aa434fb07df73..d06eca11424d9 100644 --- a/crates/polars-core/src/frame/mod.rs +++ b/crates/polars-core/src/frame/mod.rs @@ -922,7 +922,7 @@ impl DataFrame { /// # Ok::<(), PolarsError>(()) /// ``` pub fn height(&self) -> usize { - self.shape().0 + self.height } /// Returns the size as number of rows * number of columns diff --git a/crates/polars-core/src/series/implementations/datetime.rs b/crates/polars-core/src/series/implementations/datetime.rs index ace52993b8a1e..7f0d575bd916a 100644 --- a/crates/polars-core/src/series/implementations/datetime.rs +++ b/crates/polars-core/src/series/implementations/datetime.rs @@ -254,16 +254,8 @@ impl SeriesTrait for SeriesWrap { } fn cast(&self, dtype: &DataType, cast_options: CastOptions) -> PolarsResult { - match (dtype, self.0.time_unit()) { - (DataType::String, TimeUnit::Milliseconds) => { - Ok(self.0.to_string("%F %T%.3f")?.into_series()) - }, - (DataType::String, TimeUnit::Microseconds) => { - Ok(self.0.to_string("%F %T%.6f")?.into_series()) - }, - (DataType::String, TimeUnit::Nanoseconds) => { - Ok(self.0.to_string("%F %T%.9f")?.into_series()) - }, + match dtype { + DataType::String => Ok(self.0.to_string("iso")?.into_series()), _ => self.0.cast_with_options(dtype, cast_options), } } diff --git a/crates/polars-time/src/series/mod.rs b/crates/polars-time/src/series/mod.rs index e9f728bf6e09f..5009e5beea2a1 100644 --- a/crates/polars-time/src/series/mod.rs +++ b/crates/polars-time/src/series/mod.rs @@ -256,14 +256,28 @@ pub trait TemporalMethods: AsSeries { fn to_string(&self, format: &str) -> PolarsResult { let s = self.as_series(); match s.dtype() { - #[cfg(feature = "dtype-date")] - DataType::Date => s.date().map(|ca| Ok(ca.to_string(format)?.into_series()))?, #[cfg(feature = "dtype-datetime")] - DataType::Datetime(_, _) => s - .datetime() - .map(|ca| Ok(ca.to_string(format)?.into_series()))?, + DataType::Datetime(_, _) => { + let format = get_strftime_format(format, s.dtype()); + s.datetime() + .map(|ca| Ok(ca.to_string(format.as_str())?.into_series()))? + }, + #[cfg(feature = "dtype-date")] + DataType::Date => { + let format = get_strftime_format(format, s.dtype()); + s.date() + .map(|ca| Ok(ca.to_string(format.as_str())?.into_series()))? + }, #[cfg(feature = "dtype-time")] - DataType::Time => s.time().map(|ca| ca.to_string(format).into_series()), + DataType::Time => { + let format = get_strftime_format(format, s.dtype()); + s.time() + .map(|ca| ca.to_string(format.as_str()).into_series()) + }, + #[cfg(feature = "dtype-duration")] + DataType::Duration(_) => s + .duration() + .map(|ca| Ok(ca.to_string(format)?.into_series()))?, dt => polars_bail!(opq = to_string, dt), } } diff --git a/py-polars/polars/dataframe/frame.py b/py-polars/polars/dataframe/frame.py index 4ff2752fdfb59..7fa7d5126af57 100644 --- a/py-polars/polars/dataframe/frame.py +++ b/py-polars/polars/dataframe/frame.py @@ -4548,7 +4548,7 @@ def insert_column(self, index: int, column: IntoExprColumn) -> DataFrame: Parameters ---------- index - Index at which to insert the new `Series` column. + Index at which to insert the new column. column `Series` or expression to insert. diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py index 4628ee2a9c159..3c72613c9a00c 100644 --- a/py-polars/polars/expr/datetime.py +++ b/py-polars/polars/expr/datetime.py @@ -450,13 +450,10 @@ def combine(self, time: dt.time | Expr, time_unit: TimeUnit = "us") -> Expr: time = parse_into_expression(time) return wrap_expr(self._pyexpr.dt_combine(time, time_unit)) - def to_string(self, format: str) -> Expr: + def to_string(self, format: str | None = None) -> Expr: """ Convert a Date/Time/Datetime column into a String column with the given format. - Similar to `cast(pl.String)`, but this method allows you to customize the - formatting of the resulting string. - Parameters ---------- format @@ -464,52 +461,123 @@ def to_string(self, format: str) -> Expr: `_ for specification. Example: `"%y-%m-%d"`. + Notes + ----- + * Similar to `cast(pl.String)`, but this method allows you to customize + the formatting of the resulting string; if no format is provided, the + appropriate ISO format for the underlying data type is used. + + * Duration dtype expressions cannot be formatted with `strftime`. Instead, + only "iso" and "polars" are supported as format strings. The "iso" format + string results in ISO8601 duration string output, and "polars" results + in the same form seen in the frame `repr`. + Examples -------- - >>> from datetime import datetime + >>> from datetime import datetime, date, timedelta, time >>> df = pl.DataFrame( ... { - ... "datetime": [ - ... datetime(2020, 3, 1), - ... datetime(2020, 4, 1), - ... datetime(2020, 5, 1), - ... ] + ... "dt": [ + ... date(1999, 3, 1), + ... date(2020, 5, 3), + ... date(2077, 7, 5), + ... ], + ... "dtm": [ + ... datetime(1980, 8, 10, 0, 10, 20), + ... datetime(2010, 10, 20, 8, 25, 35), + ... datetime(2040, 12, 30, 16, 40, 50), + ... ], + ... "tm": [ + ... time(1, 2, 3, 456789), + ... time(23, 59, 9, 101), + ... time(0, 0, 0, 100), + ... ], + ... "td": [ + ... timedelta(days=-1, seconds=-42), + ... timedelta(days=14, hours=-10, microseconds=1001), + ... timedelta(seconds=0), + ... ], ... } ... ) - >>> df.with_columns( - ... pl.col("datetime") - ... .dt.to_string("%Y/%m/%d %H:%M:%S") - ... .alias("datetime_string") + + Default format for temporal dtypes is ISO8601: + + >>> import polars.selectors as cs + >>> df.select((cs.date() | cs.datetime()).dt.to_string().name.prefix("s_")) + shape: (3, 2) + ┌────────────┬────────────────────────────┐ + │ s_dt ┆ s_dtm │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞════════════╪════════════════════════════╡ + │ 1999-03-01 ┆ 1980-08-10 00:10:20.000000 │ + │ 2020-05-03 ┆ 2010-10-20 08:25:35.000000 │ + │ 2077-07-05 ┆ 2040-12-30 16:40:50.000000 │ + └────────────┴────────────────────────────┘ + >>> df.select((cs.time() | cs.duration()).dt.to_string().name.prefix("s_")) + shape: (3, 2) + ┌─────────────────┬───────────────────┐ + │ s_tm ┆ s_td │ + │ --- ┆ --- │ + │ str ┆ str │ + ╞═════════════════╪═══════════════════╡ + │ 01:02:03.456789 ┆ -P1DT42.S │ + │ 23:59:09.000101 ┆ P13DT14H0.001001S │ + │ 00:00:00.000100 ┆ PT0S │ + └─────────────────┴───────────────────┘ + + All temporal types (aside from `Duration`) support strftime formatting: + + >>> df.select( + ... pl.col("dtm"), + ... s_dtm=pl.col("dtm").dt.to_string("%Y/%m/%d (%H.%M.%S)"), ... ) shape: (3, 2) - ┌─────────────────────┬─────────────────────┐ - │ datetime ┆ datetime_string │ - │ --- ┆ --- │ - │ datetime[μs] ┆ str │ - ╞═════════════════════╪═════════════════════╡ - │ 2020-03-01 00:00:00 ┆ 2020/03/01 00:00:00 │ - │ 2020-04-01 00:00:00 ┆ 2020/04/01 00:00:00 │ - │ 2020-05-01 00:00:00 ┆ 2020/05/01 00:00:00 │ - └─────────────────────┴─────────────────────┘ + ┌─────────────────────┬───────────────────────┐ + │ dtm ┆ s_dtm │ + │ --- ┆ --- │ + │ datetime[μs] ┆ str │ + ╞═════════════════════╪═══════════════════════╡ + │ 1980-08-10 00:10:20 ┆ 1980/08/10 (00.10.20) │ + │ 2010-10-20 08:25:35 ┆ 2010/10/20 (08.25.35) │ + │ 2040-12-30 16:40:50 ┆ 2040/12/30 (16.40.50) │ + └─────────────────────┴───────────────────────┘ - If you're interested in the day name / month name, you can use - `'%A'` / `'%B'`: + The Polars Duration string format (as seen in the frame repr) is also available: - >>> df.with_columns( - ... day_name=pl.col("datetime").dt.to_string("%A"), - ... month_name=pl.col("datetime").dt.to_string("%B"), + >>> df.select(pl.col("td"), s_td=pl.col("td").dt.to_string("polars")) + shape: (3, 2) + ┌────────────────┬────────────────┐ + │ td ┆ s_td │ + │ --- ┆ --- │ + │ duration[μs] ┆ str │ + ╞════════════════╪════════════════╡ + │ -1d -42s ┆ -1d -42s │ + │ 13d 14h 1001µs ┆ 13d 14h 1001µs │ + │ 0µs ┆ 0µs │ + └────────────────┴────────────────┘ + + If you're interested in extracting the day or month names, you can use + the `'%A'` and `'%B'` strftime specifiers: + + >>> df.select( + ... pl.col("dt"), + ... day_name=pl.col("dtm").dt.to_string("%A"), + ... month_name=pl.col("dtm").dt.to_string("%B"), ... ) shape: (3, 3) - ┌─────────────────────┬───────────┬────────────┐ - │ datetime ┆ day_name ┆ month_name │ - │ --- ┆ --- ┆ --- │ - │ datetime[μs] ┆ str ┆ str │ - ╞═════════════════════╪═══════════╪════════════╡ - │ 2020-03-01 00:00:00 ┆ Sunday ┆ March │ - │ 2020-04-01 00:00:00 ┆ Wednesday ┆ April │ - │ 2020-05-01 00:00:00 ┆ Friday ┆ May │ - └─────────────────────┴───────────┴────────────┘ - """ + ┌────────────┬───────────┬────────────┐ + │ dt ┆ day_name ┆ month_name │ + │ --- ┆ --- ┆ --- │ + │ date ┆ str ┆ str │ + ╞════════════╪═══════════╪════════════╡ + │ 1999-03-01 ┆ Sunday ┆ August │ + │ 2020-05-03 ┆ Wednesday ┆ October │ + │ 2077-07-05 ┆ Sunday ┆ December │ + └────────────┴───────────┴────────────┘ + """ + if format is None: + format = "iso" return wrap_expr(self._pyexpr.dt_to_string(format)) def strftime(self, format: str) -> Expr: diff --git a/py-polars/polars/series/datetime.py b/py-polars/polars/series/datetime.py index 3b0e905b84fcc..0ce6948aa8aae 100644 --- a/py-polars/polars/series/datetime.py +++ b/py-polars/polars/series/datetime.py @@ -211,9 +211,6 @@ def to_string(self, format: str) -> Series: """ Convert a Date/Time/Datetime column into a String column with the given format. - Similar to `cast(pl.String)`, but this method allows you to customize the - formatting of the resulting string. - Parameters ---------- format @@ -221,24 +218,49 @@ def to_string(self, format: str) -> Series: `_ for specification. Example: `"%y-%m-%d"`. + Notes + ----- + * Similar to `cast(pl.String)`, but this method allows you to customize + the formatting of the resulting string; if no format is provided, the + appropriate ISO format for the underlying data type is used. + + * Duration dtype Series cannot be formatted with `strftime`. Instead, + only "iso" and "polars" are supported as format strings. The "iso" format + string results in ISO8601 duration string output, and "polars" results + in the same form seen in the frame `repr`. + Examples -------- - >>> from datetime import datetime + >>> from datetime import date >>> s = pl.Series( ... "datetime", - ... [datetime(2020, 3, 1), datetime(2020, 4, 1), datetime(2020, 5, 1)], + ... [date(2020, 3, 1), date(2020, 4, 1), date(2020, 5, 1)], ... ) - >>> s.dt.to_string("%Y/%m/%d") + + Default for temporal dtypes (if not specifying a format string) is ISO8601: + + >>> s.dt.to_string() shape: (3,) Series: 'datetime' [str] [ - "2020/03/01" - "2020/04/01" - "2020/05/01" + "2020-03-01" + "2020-04-01" + "2020-05-01" ] - If you're interested in the day name / month name, you can use - `'%A'` / `'%B'`: + The output can be customized by using a strftime-compatible format string: + + >>> s.dt.to_string("%d/%m/%y") + shape: (3,) + Series: 'datetime' [str] + [ + "01/03/20" + "01/04/20" + "01/05/20" + ] + + If you're interested in using day or month names, you can use + the `'%A'` and/or `'%B'` format strings: >>> s.dt.to_string("%A") shape: (3,) diff --git a/py-polars/tests/unit/datatypes/test_duration.py b/py-polars/tests/unit/datatypes/test_duration.py index 597ac1c4a624e..f9df754466c99 100644 --- a/py-polars/tests/unit/datatypes/test_duration.py +++ b/py-polars/tests/unit/datatypes/test_duration.py @@ -22,6 +22,70 @@ def test_duration_cum_sum() -> None: assert df.schema["A"].is_(duration_dtype) is False +def test_duration_cast() -> None: + durations = [ + timedelta(days=180, seconds=56789, microseconds=987654), + timedelta(days=0, seconds=64875, microseconds=8884), + timedelta(days=2, hours=23, seconds=4975, milliseconds=1), + timedelta(hours=1, seconds=1, milliseconds=1, microseconds=1), + timedelta(seconds=-42, milliseconds=-42), + None, + ] + + df = pl.DataFrame({"td": durations}, schema={"td": pl.Duration("us")}) + df_cast = df.select( + td_ms=pl.col("td").cast(pl.Duration("ms")), + td_int=pl.col("td").cast(pl.Int64), + td_str_iso=pl.col("td").dt.to_string(), + td_str_pl=pl.col("td").dt.to_string("polars"), + ) + assert df_cast.schema == { + "td_ms": pl.Duration(time_unit="ms"), + "td_int": pl.Int64, + "td_str_iso": pl.String, + "td_str_pl": pl.String, + } + + expected = pl.DataFrame( + { + "td_ms": [ + timedelta(days=180, seconds=56789, milliseconds=987), + timedelta(days=0, seconds=64875, milliseconds=8), + timedelta(days=2, hours=23, seconds=4975, milliseconds=1), + timedelta(hours=1, seconds=1, milliseconds=1), + timedelta(seconds=-42, milliseconds=-42), + None, + ], + "td_int": [ + 15608789987654, + 64875008884, + 260575001000, + 3601001001, + -42042000, + None, + ], + "td_str_iso": [ + "P180DT15H46M29.987654S", + "PT18H1M15.008884S", + "P3DT22M55.001S", + "PT1H1.001001S", + "-PT42.042S", + None, + ], + "td_str_pl": [ + "180d 15h 46m 29s 987654µs", + "18h 1m 15s 8884µs", + "3d 22m 55s 1ms", + "1h 1s 1001µs", + "-42s -42ms", + None, + ], + }, + schema_overrides={"td_ms": pl.Duration(time_unit="ms")}, + ) + assert_frame_equal(expected, df_cast) + + def test_duration_std_var() -> None: df = pl.DataFrame( {"duration": [1000, 5000, 3000]}, schema={"duration": pl.Duration} diff --git a/py-polars/tests/unit/datatypes/test_temporal.py b/py-polars/tests/unit/datatypes/test_temporal.py index 042a0fca786b5..ec87a45a92064 100644 --- a/py-polars/tests/unit/datatypes/test_temporal.py +++ b/py-polars/tests/unit/datatypes/test_temporal.py @@ -12,6 +12,7 @@ from hypothesis import given import polars as pl +import polars.selectors as cs from polars.datatypes import DTYPE_TEMPORAL_UNITS from polars.exceptions import ( ComputeError, @@ -1094,6 +1095,69 @@ def test_datetime_string_casts() -> None: ] +def test_temporal_to_string_iso_default() -> None: + df = pl.DataFrame( + { + "td": [ + timedelta(days=-1, seconds=-42), + timedelta(days=14, hours=-10, microseconds=1001), + timedelta(seconds=0), + ], + "tm": [ + time(1, 2, 3, 456789), + time(23, 59, 9, 101), + time(0), + ], + "dt": [ + date(1999, 3, 1), + date(2020, 5, 3), + date(2077, 7, 5), + ], + "dtm": [ + datetime(1980, 8, 10, 0, 10, 20), + datetime(2010, 10, 20, 8, 25, 35), + datetime(2040, 12, 30, 16, 40, 50), + ], + } + ).with_columns(dtm_tz=pl.col("dtm").dt.replace_time_zone("Asia/Kathmandu")) + + df_stringified = df.select( + pl.col("td").dt.to_string("polars").alias("td_pl"), cs.temporal().dt.to_string() + ) + assert df_stringified.to_dict(as_series=False) == { + "td_pl": [ + "-1d -42s", + "13d 14h 1001µs", + "0µs", + ], + "td": [ + "-P1DT42.S", + "P13DT14H0.001001S", + "PT0S", + ], + "tm": [ + "01:02:03.456789", + "23:59:09.000101", + "00:00:00", + ], + "dt": [ + "1999-03-01", + "2020-05-03", + "2077-07-05", + ], + "dtm": [ + "1980-08-10 00:10:20.000000", + "2010-10-20 08:25:35.000000", + "2040-12-30 16:40:50.000000", + ], + "dtm_tz": [ + "1980-08-10 00:10:20.000000+05:30", + "2010-10-20 08:25:35.000000+05:45", + "2040-12-30 16:40:50.000000+05:45", + ], + } + + def test_iso_year() -> None: assert pl.Series([datetime(2022, 1, 1, 7, 8, 40)]).dt.iso_year()[0] == 2021 assert pl.Series([date(2022, 1, 1)]).dt.iso_year()[0] == 2021 diff --git a/py-polars/tests/unit/interop/test_interop.py b/py-polars/tests/unit/interop/test_interop.py index b69a10671ca71..7ab6c196c8072 100644 --- a/py-polars/tests/unit/interop/test_interop.py +++ b/py-polars/tests/unit/interop/test_interop.py @@ -412,13 +412,13 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - ┌─────┬─────┬─────┬─────┬─────┬───────┐ - │ id ┆ q1 ┆ q2 ┆ q3 ┆ q4 ┆ total │ - │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ - │ str ┆ i8 ┆ i16 ┆ i32 ┆ i64 ┆ f64 │ - ╞═════╪═════╪═════╪═════╪═════╪═══════╡ - └─────┴─────┴─────┴─────┴─────┴───────┘ - """ + ┌─────┬─────┬─────┬─────┬─────┬───────┐ + │ id ┆ q1 ┆ q2 ┆ q3 ┆ q4 ┆ total │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ i8 ┆ i16 ┆ i32 ┆ i64 ┆ f64 │ + ╞═════╪═════╪═════╪═════╪═════╪═══════╡ + └─────┴─────┴─────┴─────┴─────┴───────┘ + """ ), ) assert df.shape == (0, 6) @@ -437,11 +437,11 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - ┌──────┬───────┐ - │ misc ┆ other │ - ╞══════╪═══════╡ - └──────┴───────┘ - """ + ┌──────┬───────┐ + │ misc ┆ other │ + ╞══════╪═══════╡ + └──────┴───────┘ + """ ), ) assert_frame_equal(df, pl.DataFrame(schema={"misc": pl.String, "other": pl.String})) @@ -472,17 +472,17 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - # >>> Missing cols with old-style ellipsis, nulls, commented out - # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐ - # │ dt ┆ c1 ┆ c2 ┆ c3 ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99 │ - # │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ - # │ date ┆ i32 ┆ i32 ┆ i32 ┆ ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ - # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡ - # │ 2023-03-25 ┆ 1 ┆ 2 ┆ 3 ┆ ... ┆ 96 ┆ 97 ┆ 98 ┆ 99 │ - # │ 1999-12-31 ┆ 3 ┆ 6 ┆ 9 ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │ - # │ null ┆ 9 ┆ 18 ┆ 27 ┆ ... ┆ 864 ┆ 873 ┆ 882 ┆ 891 │ - # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘ - """ + # >>> Missing cols with old-style ellipsis, nulls, commented out + # ┌────────────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬──────┐ + # │ dt ┆ c1 ┆ c2 ┆ c3 ┆ ... ┆ c96 ┆ c97 ┆ c98 ┆ c99 │ + # │ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │ + # │ date ┆ i32 ┆ i32 ┆ i32 ┆ ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + # ╞════════════╪═════╪═════╪═════╪═════╪═════╪═════╪═════╪══════╡ + # │ 2023-03-25 ┆ 1 ┆ 2 ┆ 3 ┆ ... ┆ 96 ┆ 97 ┆ 98 ┆ 99 │ + # │ 1999-12-31 ┆ 3 ┆ 6 ┆ 9 ┆ ... ┆ 288 ┆ 291 ┆ 294 ┆ null │ + # │ null ┆ 9 ┆ 18 ┆ 27 ┆ ... ┆ 864 ┆ 873 ┆ 882 ┆ 891 │ + # └────────────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴──────┘ + """ ), ) assert df.schema == { @@ -505,15 +505,15 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - # >>> no dtypes: - # ┌────────────┬──────┐ - # │ dt ┆ c99 │ - # ╞════════════╪══════╡ - # │ 2023-03-25 ┆ 99 │ - # │ 1999-12-31 ┆ null │ - # │ null ┆ 891 │ - # └────────────┴──────┘ - """ + # >>> no dtypes: + # ┌────────────┬──────┐ + # │ dt ┆ c99 │ + # ╞════════════╪══════╡ + # │ 2023-03-25 ┆ 99 │ + # │ 1999-12-31 ┆ null │ + # │ null ┆ 891 │ + # └────────────┴──────┘ + """ ), ) assert df.schema == {"dt": pl.Date, "c99": pl.Int64} @@ -527,25 +527,25 @@ def test_dataframe_from_repr() -> None: pl.DataFrame, pl.from_repr( """ - In [2]: with pl.Config() as cfg: - ...: pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True) - ...: print(df) - ...: - shape: (1, 5) - ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮ - │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp │ - │ tor_id ┆ nnel_id ┆ ┆ --- ┆ --- │ - │ --- ┆ --- ┆ ┆ str ┆ datetime[μs, Asia/Tokyo] │ - │ i32 ┆ i64 ┆ ┆ ┆ │ - ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡ - │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │ - ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ … ┆ … ┆ … ┆ … ┆ … │ - ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ - │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │ - ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯ - # "Een fluitje van een cent..." :) - """ + In [2]: with pl.Config() as cfg: + ...: pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True) + ...: print(df) + ...: + shape: (1, 5) + ╭───────────┬────────────┬───┬───────┬────────────────────────────────╮ + │ source_ac ┆ source_cha ┆ … ┆ ident ┆ timestamp │ + │ tor_id ┆ nnel_id ┆ ┆ --- ┆ --- │ + │ --- ┆ --- ┆ ┆ str ┆ datetime[μs, Asia/Tokyo] │ + │ i32 ┆ i64 ┆ ┆ ┆ │ + ╞═══════════╪════════════╪═══╪═══════╪════════════════════════════════╡ + │ 123456780 ┆ 9876543210 ┆ … ┆ a:b:c ┆ 2023-03-25 10:56:59.663053 JST │ + ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ … ┆ … ┆ … ┆ … ┆ … │ + ├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤ + │ 803065983 ┆ 2055938745 ┆ … ┆ x:y:z ┆ 2023-03-25 12:38:18.050545 JST │ + ╰───────────┴────────────┴───┴───────┴────────────────────────────────╯ + # "Een fluitje van een cent..." :) + """ ), ) assert df.shape == (2, 4)