diff --git a/datafusion/functions/src/datetime/common.rs b/datafusion/functions/src/datetime/common.rs
index 6048eeeaa554..89b40a3534d3 100644
--- a/datafusion/functions/src/datetime/common.rs
+++ b/datafusion/functions/src/datetime/common.rs
@@ -153,6 +153,29 @@ pub(crate) fn string_to_timestamp_nanos_formatted(
     })
 }
 
+/// Accepts a string with a `chrono` format and converts it to a
+/// millisecond precision timestamp.
+///
+/// See [`chrono::format::strftime`] for the full set of supported formats.
+///
+/// Internally, this function uses the `chrono` library for the
+/// datetime parsing.
+///
+/// ## Timezone / Offset Handling
+///
+/// The returned value counts milliseconds relative to the Unix epoch in UTC.
+///
+/// Any timezone or offset information in the input is handled according to
+/// the rules defined by `chrono`.
+///
+/// [`chrono::format::strftime`]: https://docs.rs/chrono/latest/chrono/format/strftime/index.html
+#[inline]
+pub(crate) fn string_to_timestamp_millis_formatted(s: &str, format: &str) -> Result<i64> {
+    // `timestamp_millis` is already relative to the UTC epoch, so the
+    // `naive_utc()` / `and_utc()` round trip is unnecessary.
+    Ok(string_to_datetime_formatted(&Utc, s, format)?.timestamp_millis())
+}
+
 pub(crate) fn handle<'a, O, F, S>(
     args: &'a [ColumnarValue],
     op: F,
diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs
index cc5ffa73c8f1..288641b84dd7 100644
--- a/datafusion/functions/src/datetime/to_date.rs
+++ b/datafusion/functions/src/datetime/to_date.rs
@@ -17,12 +17,14 @@
 
 use std::any::Any;
 
-use arrow::array::types::Date32Type;
 use arrow::datatypes::DataType;
 use arrow::datatypes::DataType::Date32;
+use arrow::error::ArrowError::ParseError;
+use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser};
 
 use crate::datetime::common::*;
-use datafusion_common::{exec_err, internal_datafusion_err, Result};
+use datafusion_common::error::DataFusionError;
+use datafusion_common::{arrow_err, exec_err, internal_datafusion_err, Result};
 use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
 
 #[derive(Debug)]
@@ -47,22 +49,20 @@ impl ToDateFunc {
         match args.len() {
             1 => handle::(
                 args,
-                |s| {
-                    string_to_timestamp_nanos_shim(s)
-                        .map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000))
-                        .and_then(|v| {
-                            v.try_into().map_err(|_| {
-                                internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed")
-                            })
-                        })
+                |s| match Date32Type::parse(s) {
+                    Some(v) => Ok(v),
+                    // Report the real failure: the input is not a parseable date.
+                    None => arrow_err!(ParseError(format!(
+                        "Unable to parse '{s}' as a Date32 value"
+                    ))),
                 },
                 "to_date",
             ),
             2..
=> handle_multiple::(
                 args,
                 |s, format| {
-                    string_to_timestamp_nanos_formatted(s, format)
-                        .map(|n| n / (1_000_000 * 24 * 60 * 60 * 1_000))
+                    string_to_timestamp_millis_formatted(s, format)
+                        .map(|n| n.div_euclid(24 * 60 * 60 * 1_000)) // floor division
                         .and_then(|v| {
                             v.try_into().map_err(|_| {
                                 internal_datafusion_err!("Unable to cast to Date32 for converting from i64 to i32 failed")
@@ -118,3 +118,233 @@ impl ScalarUDFImpl for ToDateFunc {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use arrow::{compute::kernels::cast_utils::Parser, datatypes::Date32Type};
+    use datafusion_common::ScalarValue;
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    use super::ToDateFunc;
+
+    #[test]
+    fn test_to_date_with_format_before_epoch_with_time() {
+        // A pre-epoch instant that is not at midnight must map to its own
+        // calendar day instead of being rounded towards 1970-01-01.
+        let date_scalar = ScalarValue::Utf8(Some("1969-12-31T12:00:00+00:00".into()));
+        let format_scalar = ScalarValue::Utf8(Some("%+".into()));
+
+        let to_date_result = ToDateFunc::new().invoke(&[
+            ColumnarValue::Scalar(date_scalar),
+            ColumnarValue::Scalar(format_scalar),
+        ]);
+
+        match to_date_result {
+            Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
+                let expected = Date32Type::parse_formatted("1969-12-31", "%Y-%m-%d");
+                assert_eq!(date_val, expected, "to_date created wrong value");
+            }
+            _ => panic!("Conversion failed"),
+        }
+    }
+
+    #[test]
+    fn test_to_date_without_format() {
+        struct TestCase {
+            name: &'static str,
+            date_str: &'static str,
+        }
+
+        let test_cases = vec![
+            TestCase {
+                name: "Largest four-digit year (9999)",
+                date_str: "9999-12-31",
+            },
+            TestCase {
+                name: "Year 1 (0001)",
+                date_str: "0001-12-31",
+            },
+            TestCase {
+                name: "Year before epoch (1969)",
+                date_str: "1969-01-01",
+            },
+            TestCase {
+                name: "Switch Julian/Gregorian calendar (1582-10-10)",
+                date_str: "1582-10-10",
+            },
+        ];
+
+        for tc in &test_cases {
+            let date_scalar = ScalarValue::Utf8(Some(tc.date_str.to_string()));
+            let to_date_result =
+                ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]);
+
+            match to_date_result {
+                Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
+                    let expected = Date32Type::parse_formatted(tc.date_str, "%Y-%m-%d");
+                    assert_eq!(
+                        date_val, expected,
+                        "{}: to_date created wrong value",
+                        tc.name
+                    );
+                }
+                _ => panic!("Could not convert '{}' to Date", tc.date_str),
+            }
+        }
+    }
+
+    #[test]
+    fn test_to_date_with_format() {
+        struct TestCase {
+            name: &'static str,
+            date_str: &'static str,
+            format_str: &'static str,
+            formatted_date: &'static str,
+        }
+
+        let test_cases = vec![
+            TestCase {
+                name: "Largest four-digit year (9999)",
+                date_str: "9999-12-31",
+                format_str: "%Y%m%d",
+                formatted_date: "99991231",
+            },
+            TestCase {
+                name: "Smallest four-digit year (-9999)",
+                date_str: "-9999-12-31",
+                format_str: "%Y/%m/%d",
+                formatted_date: "-9999/12/31",
+            },
+            TestCase {
+                name: "Year 1 (0001)",
+                date_str: "0001-12-31",
+                format_str: "%Y%m%d",
+                formatted_date: "00011231",
+            },
+            TestCase {
+                name: "Year before epoch (1969)",
+                date_str: "1969-01-01",
+                format_str: "%Y%m%d",
+                formatted_date: "19690101",
+            },
+            TestCase {
+                name: "Switch Julian/Gregorian calendar (1582-10-10)",
+                date_str: "1582-10-10",
+                format_str: "%Y%m%d",
+                formatted_date: "15821010",
+            },
+            TestCase {
+                name: "Negative Year, BC (-42-01-01)",
+                date_str: "-42-01-01",
+                format_str: "%Y/%m/%d",
+                formatted_date: "-42/01/01",
+            },
+        ];
+
+        for tc in &test_cases {
+            let formatted_date_scalar =
+                ScalarValue::Utf8(Some(tc.formatted_date.to_string()));
+            let format_scalar = ScalarValue::Utf8(Some(tc.format_str.to_string()));
+
+            let to_date_result = ToDateFunc::new().invoke(&[
+                ColumnarValue::Scalar(formatted_date_scalar),
+                ColumnarValue::Scalar(format_scalar),
+            ]);
+
+            match to_date_result {
+                Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
+                    let expected = Date32Type::parse_formatted(tc.date_str, "%Y-%m-%d");
+                    assert_eq!(date_val, expected, "{}: to_date created wrong value for date '{}' with format string '{}'", tc.name, tc.formatted_date, tc.format_str);
+                }
+                _ => panic!(
+                    "Could not convert '{}' with format string '{}' to Date",
+                    tc.formatted_date, tc.format_str
+                ),
+            }
+        }
+    }
+
+    #[test]
+    fn test_to_date_multiple_format_strings() {
+        let formatted_date_scalar = ScalarValue::Utf8(Some("2023/01/31".into()));
+        let format1_scalar = ScalarValue::Utf8(Some("%Y-%m-%d".into()));
+        let format2_scalar = ScalarValue::Utf8(Some("%Y/%m/%d".into()));
+
+        let to_date_result = ToDateFunc::new().invoke(&[
+            ColumnarValue::Scalar(formatted_date_scalar),
+            ColumnarValue::Scalar(format1_scalar),
+            ColumnarValue::Scalar(format2_scalar),
+        ]);
+
+        match to_date_result {
+
Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
+                let expected = Date32Type::parse_formatted("2023-01-31", "%Y-%m-%d");
+                assert_eq!(
+                    date_val, expected,
+                    "to_date created wrong value for date with 2 format strings"
+                );
+            }
+            _ => panic!("Conversion failed"),
+        }
+    }
+
+    #[test]
+    fn test_to_date_from_timestamp() {
+        let test_cases = vec![
+            "2020-09-08T13:42:29Z",
+            "2020-09-08T13:42:29.190855-05:00",
+            "2020-09-08 12:13:29",
+        ];
+        for date_str in test_cases {
+            let formatted_date_scalar = ScalarValue::Utf8(Some(date_str.into()));
+
+            let to_date_result =
+                ToDateFunc::new().invoke(&[ColumnarValue::Scalar(formatted_date_scalar)]);
+
+            match to_date_result {
+                Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
+                    let expected = Date32Type::parse_formatted("2020-09-08", "%Y-%m-%d");
+                    assert_eq!(date_val, expected, "to_date created wrong value");
+                }
+                _ => panic!("Conversion of {} failed", date_str),
+            }
+        }
+    }
+
+    #[test]
+    fn test_to_date_string_with_valid_number() {
+        let date_str = "20241231";
+        let date_scalar = ScalarValue::Utf8(Some(date_str.into()));
+
+        let to_date_result =
+            ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]);
+
+        match to_date_result {
+            Ok(ColumnarValue::Scalar(ScalarValue::Date32(date_val))) => {
+                let expected = Date32Type::parse_formatted("2024-12-31", "%Y-%m-%d");
+                assert_eq!(
+                    date_val, expected,
+                    "to_date created wrong value for {}",
+                    date_str
+                );
+            }
+            _ => panic!("Conversion of {} failed", date_str),
+        }
+    }
+
+    #[test]
+    fn test_to_date_string_with_invalid_number() {
+        let date_str = "202412311";
+        let date_scalar = ScalarValue::Utf8(Some(date_str.into()));
+
+        let to_date_result =
+            ToDateFunc::new().invoke(&[ColumnarValue::Scalar(date_scalar)]);
+
+        if let Ok(ColumnarValue::Scalar(ScalarValue::Date32(_))) = to_date_result {
+            panic!(
+                "Conversion of {} succeeded, but should have failed",
+                date_str
+            );
+        }
+    }
+}
diff --git a/datafusion/sqllogictest/test_files/dates.slt
b/datafusion/sqllogictest/test_files/dates.slt
index e21637bd8913..3950a165a004 100644
--- a/datafusion/sqllogictest/test_files/dates.slt
+++ b/datafusion/sqllogictest/test_files/dates.slt
@@ -123,6 +123,7 @@ SELECT to_date(ts / 100000000) FROM to_date_t1 LIMIT 3
 2003-11-02
 2003-11-29
 
+# verify date with time zone, where the local date is already the next day, but the resulting date in UTC is the day before
 query D
 SELECT to_date('01-14-2023 01:01:30+05:30', '%q', '%d-%m-%Y %H/%M/%S', '%+', '%m-%d-%Y %H:%M:%S%#z');
 ----
@@ -137,8 +138,15 @@ select to_date(arrow_cast(123, 'Int64'))
 ----
 1970-05-04
 
-statement error DataFusion error: Arrow error:
+# Parse a sequence of digits which yields a valid date, e.g. "21311111" would be "2131-11-11"
+query D
 SELECT to_date('21311111');
+----
+2131-11-11
+
+# Parse a sequence of digits which does not make up a valid date
+statement error DataFusion error: Arrow error:
+SELECT to_date('213111111');
 
 # verify date cast with integer input
 query DDDDDD
diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md
index 80b61f8242ef..c3d3ab7a64a7 100644
--- a/docs/source/user-guide/sql/scalar_functions.md
+++ b/docs/source/user-guide/sql/scalar_functions.md
@@ -1735,9 +1735,7 @@ Strings are parsed as YYYY-MM-DD (e.g. '2023-07-20') if no [Chrono format]s are
 Integers and doubles are interpreted as days since the unix epoch (`1970-01-01T00:00:00Z`).
 Returns the corresponding date.
 
-Note: `to_date` returns Date32. The supported range for integer input is between `-96465293` and `95026237`.
-Supported range for string input is between `1677-09-21` and `2262-04-11` exclusive. To parse dates outside of
-that range use a [Chrono format].
+Note: `to_date` returns Date32, which represents its values as the number of days since the Unix epoch (`1970-01-01`), stored as a signed 32-bit value. The largest supported date value is `9999-12-31`.
 
 ```
 to_date(expression[, ..., format_n])