Skip to content

Commit 5151135

Browse files
authored
Use consistent version of string_to_timestamp_nanos in DataFusion (#767)
* Use consistent version of string_to_timestamp_nanos in DataFusion * fixup comments * Use upstream string_to_timestamp_nanos and remove copy in DataFusion
1 parent 2a4f94e commit 5151135

File tree

2 files changed

+14
-148
lines changed

2 files changed

+14
-148
lines changed

datafusion/src/optimizer/constant_folding.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
2121
use std::sync::Arc;
2222

23+
use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
2324
use arrow::datatypes::DataType;
2425

2526
use crate::error::Result;
@@ -29,7 +30,6 @@ use crate::optimizer::optimizer::OptimizerRule;
2930
use crate::optimizer::utils;
3031
use crate::physical_plan::functions::BuiltinScalarFunction;
3132
use crate::scalar::ScalarValue;
32-
use arrow::compute::kernels::cast_utils::string_to_timestamp_nanos;
3333
use arrow::compute::{kernels, DEFAULT_CAST_OPTIONS};
3434

3535
/// Optimizer that simplifies comparison expressions involving boolean literals.

datafusion/src/physical_plan/datetime_expressions.rs

Lines changed: 13 additions & 147 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ use crate::{
2525
};
2626
use arrow::{
2727
array::{Array, ArrayRef, GenericStringArray, PrimitiveArray, StringOffsetSizeTrait},
28+
compute::kernels::cast_utils::string_to_timestamp_nanos,
2829
datatypes::{
2930
ArrowPrimitiveType, DataType, TimestampMicrosecondType, TimestampMillisecondType,
3031
TimestampNanosecondType, TimestampSecondType,
@@ -41,150 +42,10 @@ use arrow::{
4142
};
4243
use chrono::prelude::*;
4344
use chrono::Duration;
44-
use chrono::LocalResult;
4545

46-
#[inline]
47-
/// Accepts a string in RFC3339 / ISO8601 standard format and some
48-
/// variants and converts it to a nanosecond precision timestamp.
49-
///
50-
/// Implements the `to_timestamp` function to convert a string to a
51-
/// timestamp, following the model of Spark SQL’s `to_timestamp`.
52-
///
53-
/// In addition to RFC3339 / ISO8601 standard timestamps, it also
54-
/// accepts strings that use a space ` ` to separate the date and time
55-
/// as well as strings that have no explicit timezone offset.
56-
///
57-
/// Examples of accepted inputs:
58-
/// * `1997-01-31T09:26:56.123Z` # RFC3339
59-
/// * `1997-01-31T09:26:56.123-05:00` # RFC3339
60-
/// * `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T
61-
/// * `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified
62-
/// * `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset
63-
/// * `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds
64-
//
65-
/// Internally, this function uses the `chrono` library for the
66-
/// datetime parsing
67-
///
68-
/// We hope to extend this function in the future with a second
69-
/// parameter to specifying the format string.
70-
///
71-
/// ## Timestamp Precision
72-
///
73-
/// DataFusion uses the maximum precision timestamps supported by
74-
/// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This
75-
/// means the range of dates that timestamps can represent is ~1677 AD
76-
/// to 2262 AD
77-
///
78-
///
79-
/// ## Timezone / Offset Handling
80-
///
81-
/// By using the Arrow format, DataFusion inherits Arrow’s handling of
82-
/// timestamp values. Specifically, the stored numerical values of
83-
/// timestamps are expressed relative to UTC.
84-
///
85-
/// This function interprets strings without an explicit time zone as
86-
/// timestamps with offsets of the local time on the machine that ran
87-
/// the datafusion query
88-
///
89-
/// For example, `1997-01-31 09:26:56.123Z` is interpreted as UTC, as
90-
/// it has an explicit timezone specifier (“Z” for Zulu/UTC)
91-
///
92-
/// `1997-01-31T09:26:56.123` is interpreted as a local timestamp in
93-
/// the timezone of the machine that ran DataFusion. For example, if
94-
/// the system timezone is set to America/New_York (UTC-5) the
95-
/// timestamp will be interpreted as though it were
96-
/// `1997-01-31T09:26:56.123-05:00`
97-
fn string_to_timestamp_nanos(s: &str) -> Result<i64> {
98-
// Fast path: RFC3339 timestamp (with a T)
99-
// Example: 2020-09-08T13:42:29.190855Z
100-
if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
101-
return Ok(ts.timestamp_nanos());
102-
}
103-
104-
// Implement quasi-RFC3339 support by trying to parse the
105-
// timestamp with various other format specifiers to support
106-
// separating the date and time with a space ' ' rather than 'T' to be
107-
// (more) compatible with Apache Spark SQL
108-
109-
// timezone offset, using ' ' as a separator
110-
// Example: 2020-09-08 13:42:29.190855-05:00
111-
if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") {
112-
return Ok(ts.timestamp_nanos());
113-
}
114-
115-
// with an explicit Z, using ' ' as a separator
116-
// Example: 2020-09-08 13:42:29Z
117-
if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") {
118-
return Ok(ts.timestamp_nanos());
119-
}
120-
121-
// Support timestamps without an explicit timezone offset, again
122-
// to be compatible with what Apache Spark SQL does.
123-
124-
// without a timezone specifier as a local time, using T as a separator
125-
// Example: 2020-09-08T13:42:29.190855
126-
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f") {
127-
return naive_datetime_to_timestamp(s, ts);
128-
}
129-
130-
// without a timezone specifier as a local time, using T as a
131-
// separator, no fractional seconds
132-
// Example: 2020-09-08T13:42:29
133-
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") {
134-
return naive_datetime_to_timestamp(s, ts);
135-
}
136-
137-
// without a timezone specifier as a local time, using ' ' as a separator
138-
// Example: 2020-09-08 13:42:29.190855
139-
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S.%f") {
140-
return naive_datetime_to_timestamp(s, ts);
141-
}
142-
143-
// without a timezone specifier as a local time, using ' ' as a
144-
// separator, no fractional seconds
145-
// Example: 2020-09-08 13:42:29
146-
if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") {
147-
return naive_datetime_to_timestamp(s, ts);
148-
}
149-
150-
// Note we don't pass along the error message from the underlying
151-
// chrono parsing because we tried several different format
152-
// strings and we don't know which the user was trying to
153-
// match. Thus any of the specific error messages is likely to be
154-
// more confusing than helpful
155-
Err(DataFusionError::Execution(format!(
156-
"Error parsing '{}' as timestamp",
157-
s
158-
)))
159-
}
160-
161-
/// Converts the naive datetime (which has no specific timezone) to a
162-
/// nanosecond epoch timestamp relative to UTC.
163-
fn naive_datetime_to_timestamp(s: &str, datetime: NaiveDateTime) -> Result<i64> {
164-
let l = Local {};
165-
166-
match l.from_local_datetime(&datetime) {
167-
LocalResult::None => Err(DataFusionError::Execution(format!(
168-
"Error parsing '{}' as timestamp: local time representation is invalid",
169-
s
170-
))),
171-
LocalResult::Single(local_datetime) => {
172-
Ok(local_datetime.with_timezone(&Utc).timestamp_nanos())
173-
}
174-
// Ambiguous times can happen if the timestamp is exactly when
175-
// a daylight savings time transition occurs, for example, and
176-
// so the datetime could validly be said to be in two
177-
// potential offsets. However, since we are about to convert
178-
// to UTC anyways, we can pick one arbitrarily
179-
LocalResult::Ambiguous(local_datetime, _) => {
180-
Ok(local_datetime.with_timezone(&Utc).timestamp_nanos())
181-
}
182-
}
183-
}
184-
185-
// given a function `op` that maps a `&str` to a Result of an arrow native type,
186-
// returns a `PrimitiveArray` after the application
187-
// of the function to `args[0]`.
46+
/// given a function `op` that maps a `&str` to a Result of an arrow native type,
47+
/// returns a `PrimitiveArray` after the application
48+
/// of the function to `args[0]`.
18849
/// # Errors
18950
/// This function errors iff:
19051
/// * the number of arguments is not 1 or
@@ -262,11 +123,16 @@ where
262123
}
263124
}
264125

126+
/// Calls string_to_timestamp_nanos and converts the error type
127+
fn string_to_timestamp_nanos_shim(s: &str) -> Result<i64> {
128+
string_to_timestamp_nanos(s).map_err(|e| e.into())
129+
}
130+
265131
/// to_timestamp SQL function
266132
pub fn to_timestamp(args: &[ColumnarValue]) -> Result<ColumnarValue> {
267133
handle::<TimestampNanosecondType, _, TimestampNanosecondType>(
268134
args,
269-
string_to_timestamp_nanos,
135+
string_to_timestamp_nanos_shim,
270136
"to_timestamp",
271137
)
272138
}
@@ -275,7 +141,7 @@ pub fn to_timestamp(args: &[ColumnarValue]) -> Result<ColumnarValue> {
275141
pub fn to_timestamp_millis(args: &[ColumnarValue]) -> Result<ColumnarValue> {
276142
handle::<TimestampMillisecondType, _, TimestampMillisecondType>(
277143
args,
278-
|s| string_to_timestamp_nanos(s).map(|n| n / 1_000_000),
144+
|s| string_to_timestamp_nanos_shim(s).map(|n| n / 1_000_000),
279145
"to_timestamp_millis",
280146
)
281147
}
@@ -284,7 +150,7 @@ pub fn to_timestamp_millis(args: &[ColumnarValue]) -> Result<ColumnarValue> {
284150
pub fn to_timestamp_micros(args: &[ColumnarValue]) -> Result<ColumnarValue> {
285151
handle::<TimestampMicrosecondType, _, TimestampMicrosecondType>(
286152
args,
287-
|s| string_to_timestamp_nanos(s).map(|n| n / 1_000),
153+
|s| string_to_timestamp_nanos_shim(s).map(|n| n / 1_000),
288154
"to_timestamp_micros",
289155
)
290156
}
@@ -293,7 +159,7 @@ pub fn to_timestamp_micros(args: &[ColumnarValue]) -> Result<ColumnarValue> {
293159
pub fn to_timestamp_seconds(args: &[ColumnarValue]) -> Result<ColumnarValue> {
294160
handle::<TimestampSecondType, _, TimestampSecondType>(
295161
args,
296-
|s| string_to_timestamp_nanos(s).map(|n| n / 1_000_000_000),
162+
|s| string_to_timestamp_nanos_shim(s).map(|n| n / 1_000_000_000),
297163
"to_timestamp_seconds",
298164
)
299165
}

0 commit comments

Comments
 (0)