@@ -25,6 +25,7 @@ use crate::{
2525} ;
2626use arrow:: {
2727 array:: { Array , ArrayRef , GenericStringArray , PrimitiveArray , StringOffsetSizeTrait } ,
28+ compute:: kernels:: cast_utils:: string_to_timestamp_nanos,
2829 datatypes:: {
2930 ArrowPrimitiveType , DataType , TimestampMicrosecondType , TimestampMillisecondType ,
3031 TimestampNanosecondType , TimestampSecondType ,
@@ -41,150 +42,10 @@ use arrow::{
4142} ;
4243use chrono:: prelude:: * ;
4344use chrono:: Duration ;
44- use chrono:: LocalResult ;
4545
46- #[ inline]
47- /// Accepts a string in RFC3339 / ISO8601 standard format and some
48- /// variants and converts it to a nanosecond precision timestamp.
49- ///
50- /// Implements the `to_timestamp` function to convert a string to a
51- /// timestamp, following the model of Spark SQL's `to_timestamp`.
52- ///
53- /// In addition to RFC3339 / ISO8601 standard timestamps, it also
54- /// accepts strings that use a space ` ` to separate the date and time
55- /// as well as strings that have no explicit timezone offset.
56- ///
57- /// Examples of accepted inputs:
58- /// * `1997-01-31T09:26:56.123Z` # RFC3339
59- /// * `1997-01-31T09:26:56.123-05:00` # RFC3339
60- /// * `1997-01-31 09:26:56.123-05:00` # close to RFC3339 but with a space rather than T
61- /// * `1997-01-31T09:26:56.123` # close to RFC3339 but no timezone offset specified
62- /// * `1997-01-31 09:26:56.123` # close to RFC3339 but uses a space and no timezone offset
63- /// * `1997-01-31 09:26:56` # close to RFC3339, no fractional seconds
64- ///
65- /// Internally, this function uses the `chrono` library for the
66- /// datetime parsing
67- ///
68- /// We hope to extend this function in the future with a second
69- /// parameter specifying the format string.
70- ///
71- /// ## Timestamp Precision
72- ///
73- /// DataFusion uses the maximum precision timestamps supported by
74- /// Arrow (nanoseconds stored as a 64-bit integer). This
75- /// means the range of dates that timestamps can represent is ~1677 AD
76- /// to 2262 AD
77- ///
78- ///
79- /// ## Timezone / Offset Handling
80- ///
81- /// By using the Arrow format, DataFusion inherits Arrow's handling of
82- /// timestamp values. Specifically, the stored numerical values of
83- /// timestamps are stored as offsets relative to UTC.
84- ///
85- /// This function interprets strings without an explicit time zone as
86- /// timestamps with offsets of the local time on the machine that ran
87- /// the datafusion query
88- ///
89- /// For example, `1997-01-31 09:26:56.123Z` is interpreted as UTC, as
90- /// it has an explicit timezone specifier ("Z" for Zulu/UTC)
91- ///
92- /// `1997-01-31T09:26:56.123` is interpreted as a local timestamp in
93- /// the timezone of the machine that ran DataFusion. For example, if
94- /// the system timezone is set to America/New_York (UTC-5) the
95- /// timestamp will be interpreted as though it were
96- /// `1997-01-31T09:26:56.123-05:00`
97- fn string_to_timestamp_nanos ( s : & str ) -> Result < i64 > {
98- // Fast path: RFC3339 timestamp (with a T)
99- // Example: 2020-09-08T13:42:29.190855Z
100- if let Ok ( ts) = DateTime :: parse_from_rfc3339 ( s) {
101- return Ok ( ts. timestamp_nanos ( ) ) ;
102- }
103-
104- // Implement quasi-RFC3339 support by trying to parse the
105- // timestamp with various other format specifiers to support
106- // separating the date and time with a space ' ' rather than 'T' to be
107- // (more) compatible with Apache Spark SQL
108-
109- // timezone offset, using ' ' as a separator
110- // Example: 2020-09-08 13:42:29.190855-05:00
111- if let Ok ( ts) = DateTime :: parse_from_str ( s, "%Y-%m-%d %H:%M:%S%.f%:z" ) {
112- return Ok ( ts. timestamp_nanos ( ) ) ;
113- }
114-
115- // with an explicit Z, using ' ' as a separator
116- // Example: 2020-09-08 13:42:29Z
117- if let Ok ( ts) = Utc . datetime_from_str ( s, "%Y-%m-%d %H:%M:%S%.fZ" ) {
118- return Ok ( ts. timestamp_nanos ( ) ) ;
119- }
120-
121- // Support timestamps without an explicit timezone offset, again
122- // to be compatible with what Apache Spark SQL does.
123-
124- // without a timezone specifier as a local time, using T as a separator
125- // Example: 2020-09-08T13:42:29.190855
126- if let Ok ( ts) = NaiveDateTime :: parse_from_str ( s, "%Y-%m-%dT%H:%M:%S.%f" ) {
127- return naive_datetime_to_timestamp ( s, ts) ;
128- }
129-
130- // without a timezone specifier as a local time, using T as a
131- // separator, no fractional seconds
132- // Example: 2020-09-08T13:42:29
133- if let Ok ( ts) = NaiveDateTime :: parse_from_str ( s, "%Y-%m-%dT%H:%M:%S" ) {
134- return naive_datetime_to_timestamp ( s, ts) ;
135- }
136-
137- // without a timezone specifier as a local time, using ' ' as a separator
138- // Example: 2020-09-08 13:42:29.190855
139- if let Ok ( ts) = NaiveDateTime :: parse_from_str ( s, "%Y-%m-%d %H:%M:%S.%f" ) {
140- return naive_datetime_to_timestamp ( s, ts) ;
141- }
142-
143- // without a timezone specifier as a local time, using ' ' as a
144- // separator, no fractional seconds
145- // Example: 2020-09-08 13:42:29
146- if let Ok ( ts) = NaiveDateTime :: parse_from_str ( s, "%Y-%m-%d %H:%M:%S" ) {
147- return naive_datetime_to_timestamp ( s, ts) ;
148- }
149-
150- // Note we don't pass along the error message from the underlying
151- // chrono parsing because we tried several different format
152- // strings and we don't know which the user was trying to
153- // match. Thus any of the specific error messages is likely to
154- // be more confusing than helpful
155- Err ( DataFusionError :: Execution ( format ! (
156- "Error parsing '{}' as timestamp" ,
157- s
158- ) ) )
159- }
160-
161- /// Converts the naive datetime (which has no specific timezone) to a
162- /// nanosecond epoch timestamp relative to UTC, treating it as a time in the machine's local timezone (`Local`). `s` is only used to build the error message.
163- fn naive_datetime_to_timestamp ( s : & str , datetime : NaiveDateTime ) -> Result < i64 > {
164- let l = Local { } ;
165-
166- match l. from_local_datetime ( & datetime) {
167- LocalResult :: None => Err ( DataFusionError :: Execution ( format ! (
168- "Error parsing '{}' as timestamp: local time representation is invalid" ,
169- s
170- ) ) ) ,
171- LocalResult :: Single ( local_datetime) => {
172- Ok ( local_datetime. with_timezone ( & Utc ) . timestamp_nanos ( ) )
173- }
174- // Ambiguous times can happen if the timestamp is exactly when
175- // a daylight saving time transition occurs, for example, and
176- // so the datetime could validly be said to be in two
177- // potential offsets. However, since we are about to convert
178- // to UTC anyways, we can pick one arbitrarily (the first candidate is used)
179- LocalResult :: Ambiguous ( local_datetime, _) => {
180- Ok ( local_datetime. with_timezone ( & Utc ) . timestamp_nanos ( ) )
181- }
182- }
183- }
184-
185- // given a function `op` that maps a `&str` to a Result of an arrow native type,
186- // returns a `PrimitiveArray` after the application
187- // of the function to `args[0]`.
46+ /// given a function `op` that maps a `&str` to a Result of an arrow native type,
47+ /// returns a `PrimitiveArray` after the application
48+ /// of the function to `args[0]`.
18849/// # Errors
18950/// This function errors iff:
19051/// * the number of arguments is not 1 or
@@ -262,11 +123,16 @@ where
262123 }
263124}
264125
126+ /// Calls arrow's `string_to_timestamp_nanos` on `s` and converts its error into this module's `Result` error type via `Into`
127+ fn string_to_timestamp_nanos_shim ( s : & str ) -> Result < i64 > {
128+ string_to_timestamp_nanos ( s) . map_err ( |e| e. into ( ) )
129+ }
130+
265131/// to_timestamp SQL function: passes `args` to `handle` with a string-to-nanoseconds parser, producing `TimestampNanosecondType` values
266132pub fn to_timestamp ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue > {
267133 handle :: < TimestampNanosecondType , _ , TimestampNanosecondType > (
268134 args,
269- string_to_timestamp_nanos ,
135+ string_to_timestamp_nanos_shim ,
270136 "to_timestamp" ,
271137 )
272138}
@@ -275,7 +141,7 @@ pub fn to_timestamp(args: &[ColumnarValue]) -> Result<ColumnarValue> {
275141pub fn to_timestamp_millis ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue > {
276142 handle :: < TimestampMillisecondType , _ , TimestampMillisecondType > (
277143 args,
278- |s| string_to_timestamp_nanos ( s) . map ( |n| n / 1_000_000 ) ,
144+ |s| string_to_timestamp_nanos_shim ( s) . map ( |n| n / 1_000_000 ) , // nanoseconds -> milliseconds; i64 division truncates toward zero
279145 "to_timestamp_millis" ,
280146 )
281147}
@@ -284,7 +150,7 @@ pub fn to_timestamp_millis(args: &[ColumnarValue]) -> Result<ColumnarValue> {
284150pub fn to_timestamp_micros ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue > {
285151 handle :: < TimestampMicrosecondType , _ , TimestampMicrosecondType > (
286152 args,
287- |s| string_to_timestamp_nanos ( s) . map ( |n| n / 1_000 ) ,
153+ |s| string_to_timestamp_nanos_shim ( s) . map ( |n| n / 1_000 ) , // nanoseconds -> microseconds; i64 division truncates toward zero
288154 "to_timestamp_micros" ,
289155 )
290156}
@@ -293,7 +159,7 @@ pub fn to_timestamp_micros(args: &[ColumnarValue]) -> Result<ColumnarValue> {
293159pub fn to_timestamp_seconds ( args : & [ ColumnarValue ] ) -> Result < ColumnarValue > {
294160 handle :: < TimestampSecondType , _ , TimestampSecondType > (
295161 args,
296- |s| string_to_timestamp_nanos ( s) . map ( |n| n / 1_000_000_000 ) ,
162+ |s| string_to_timestamp_nanos_shim ( s) . map ( |n| n / 1_000_000_000 ) , // nanoseconds -> seconds; i64 division truncates toward zero
297163 "to_timestamp_seconds" ,
298164 )
299165}
0 commit comments