Skip to content

Commit a5f7714

Browse files
Omega359 and alamb authored
move the Translate, SubstrIndex, FindInSet functions to datafusion-functions (#9864)
* Fix to_timestamp benchmark
* Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started.
* Fixed missing trim() function.
* Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function
* move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions
* move strpos, substr functions to datafusion_functions
* move the Translate, SubstrIndex, FindInSet functions to new datafusion-functions crate
* Test code cleanup
* unicode_expressions Cargo.toml updates.

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent d896000 commit a5f7714

File tree

19 files changed

+510
-422
lines changed

19 files changed

+510
-422
lines changed

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/core/Cargo.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,6 @@ regex_expressions = [
6969
serde = ["arrow-schema/serde"]
7070
string_expressions = ["datafusion-functions/string_expressions"]
7171
unicode_expressions = [
72-
"datafusion-physical-expr/unicode_expressions",
73-
"datafusion-optimizer/unicode_expressions",
7472
"datafusion-sql/unicode_expressions",
7573
"datafusion-functions/unicode_expressions",
7674
]

datafusion/expr/src/built_in_function.rs

Lines changed: 0 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,6 @@ pub enum BuiltinScalarFunction {
107107
InitCap,
108108
/// random
109109
Random,
110-
/// translate
111-
Translate,
112-
/// substr_index
113-
SubstrIndex,
114-
/// find_in_set
115-
FindInSet,
116110
}
117111

118112
/// Maps the sql function name to `BuiltinScalarFunction`
@@ -198,9 +192,6 @@ impl BuiltinScalarFunction {
198192
BuiltinScalarFunction::EndsWith => Volatility::Immutable,
199193
BuiltinScalarFunction::InitCap => Volatility::Immutable,
200194
BuiltinScalarFunction::Radians => Volatility::Immutable,
201-
BuiltinScalarFunction::Translate => Volatility::Immutable,
202-
BuiltinScalarFunction::SubstrIndex => Volatility::Immutable,
203-
BuiltinScalarFunction::FindInSet => Volatility::Immutable,
204195

205196
// Volatile builtin functions
206197
BuiltinScalarFunction::Random => Volatility::Volatile,
@@ -237,15 +228,6 @@ impl BuiltinScalarFunction {
237228
BuiltinScalarFunction::Pi => Ok(Float64),
238229
BuiltinScalarFunction::Random => Ok(Float64),
239230
BuiltinScalarFunction::EndsWith => Ok(Boolean),
240-
BuiltinScalarFunction::SubstrIndex => {
241-
utf8_to_str_type(&input_expr_types[0], "substr_index")
242-
}
243-
BuiltinScalarFunction::FindInSet => {
244-
utf8_to_int_type(&input_expr_types[0], "find_in_set")
245-
}
246-
BuiltinScalarFunction::Translate => {
247-
utf8_to_str_type(&input_expr_types[0], "translate")
248-
}
249231

250232
BuiltinScalarFunction::Factorial
251233
| BuiltinScalarFunction::Gcd
@@ -326,22 +308,6 @@ impl BuiltinScalarFunction {
326308
],
327309
self.volatility(),
328310
),
329-
330-
BuiltinScalarFunction::SubstrIndex => Signature::one_of(
331-
vec![
332-
Exact(vec![Utf8, Utf8, Int64]),
333-
Exact(vec![LargeUtf8, LargeUtf8, Int64]),
334-
],
335-
self.volatility(),
336-
),
337-
BuiltinScalarFunction::FindInSet => Signature::one_of(
338-
vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])],
339-
self.volatility(),
340-
),
341-
342-
BuiltinScalarFunction::Translate => {
343-
Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility())
344-
}
345311
BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()),
346312
BuiltinScalarFunction::Random => Signature::exact(vec![], self.volatility()),
347313
BuiltinScalarFunction::Power => Signature::one_of(
@@ -492,9 +458,6 @@ impl BuiltinScalarFunction {
492458
BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"],
493459
BuiltinScalarFunction::EndsWith => &["ends_with"],
494460
BuiltinScalarFunction::InitCap => &["initcap"],
495-
BuiltinScalarFunction::Translate => &["translate"],
496-
BuiltinScalarFunction::SubstrIndex => &["substr_index", "substring_index"],
497-
BuiltinScalarFunction::FindInSet => &["find_in_set"],
498461
}
499462
}
500463
}
@@ -559,9 +522,6 @@ macro_rules! get_optimal_return_type {
559522
// `utf8_to_str_type`: returns either a Utf8 or LargeUtf8 based on the input type size.
560523
get_optimal_return_type!(utf8_to_str_type, DataType::LargeUtf8, DataType::Utf8);
561524

562-
// `utf8_to_int_type`: returns either a Int32 or Int64 based on the input type size.
563-
get_optimal_return_type!(utf8_to_int_type, DataType::Int64, DataType::Int32);
564-
565525
#[cfg(test)]
566526
mod tests {
567527
use super::*;

datafusion/expr/src/expr_fn.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -576,7 +576,6 @@ scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`");
576576

577577
scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase");
578578
scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`");
579-
scalar_expr!(Translate, translate, string from to, "replaces the characters in `from` with the counterpart in `to`");
580579
nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which evaluates to the value of the first [Expr] which is not NULL");
581580
//there is a func concat_ws before, so use concat_ws_expr as name.c
582581
nary_scalar_expr!(
@@ -593,9 +592,6 @@ scalar_expr!(
593592
"returns true if a given number is +0.0 or -0.0 otherwise returns false"
594593
);
595594

596-
scalar_expr!(SubstrIndex, substr_index, string delimiter count, "Returns the substring from str before count occurrences of the delimiter");
597-
scalar_expr!(FindInSet, find_in_set, str strlist, "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings");
598-
599595
/// Create a CASE WHEN statement with literal WHEN expressions for comparison to the base expression.
600596
pub fn case(expr: Expr) -> CaseBuilder {
601597
CaseBuilder::new(Some(Box::new(expr)), vec![], vec![], None)
@@ -1006,8 +1002,5 @@ mod test {
10061002
test_scalar_expr!(Lcm, lcm, arg_1, arg_2);
10071003
test_scalar_expr!(InitCap, initcap, string);
10081004
test_scalar_expr!(EndsWith, ends_with, string, characters);
1009-
test_scalar_expr!(Translate, translate, string, from, to);
1010-
test_scalar_expr!(SubstrIndex, substr_index, string, delimiter, count);
1011-
test_scalar_expr!(FindInSet, find_in_set, string, stringlist);
10121005
}
10131006
}

datafusion/functions/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ regex_expressions = ["regex"]
5454
# enable string functions
5555
string_expressions = ["uuid"]
5656
# enable unicode functions
57-
unicode_expressions = ["unicode-segmentation"]
57+
unicode_expressions = ["hashbrown", "unicode-segmentation"]
5858

5959
[lib]
6060
name = "datafusion_functions"
@@ -72,6 +72,7 @@ datafusion-common = { workspace = true }
7272
datafusion-execution = { workspace = true }
7373
datafusion-expr = { workspace = true }
7474
datafusion-physical-expr = { workspace = true, default-features = true }
75+
hashbrown = { version = "0.14", features = ["raw"], optional = true }
7576
hex = { version = "0.4", optional = true }
7677
itertools = { workspace = true }
7778
log = { workspace = true }
datafusion/functions/src/unicode/find_in_set.rs

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::any::Any;
19+
use std::sync::Arc;
20+
21+
use arrow::array::{
22+
ArrayRef, ArrowPrimitiveType, GenericStringArray, OffsetSizeTrait, PrimitiveArray,
23+
};
24+
use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type};
25+
26+
use datafusion_common::cast::as_generic_string_array;
27+
use datafusion_common::{exec_err, Result};
28+
use datafusion_expr::TypeSignature::Exact;
29+
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
30+
31+
use crate::utils::{make_scalar_function, utf8_to_int_type};
32+
33+
#[derive(Debug)]
34+
pub(super) struct FindInSetFunc {
35+
signature: Signature,
36+
}
37+
38+
impl FindInSetFunc {
39+
pub fn new() -> Self {
40+
use DataType::*;
41+
Self {
42+
signature: Signature::one_of(
43+
vec![Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, LargeUtf8])],
44+
Volatility::Immutable,
45+
),
46+
}
47+
}
48+
}
49+
50+
impl ScalarUDFImpl for FindInSetFunc {
51+
fn as_any(&self) -> &dyn Any {
52+
self
53+
}
54+
55+
fn name(&self) -> &str {
56+
"find_in_set"
57+
}
58+
59+
fn signature(&self) -> &Signature {
60+
&self.signature
61+
}
62+
63+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
64+
utf8_to_int_type(&arg_types[0], "find_in_set")
65+
}
66+
67+
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
68+
match args[0].data_type() {
69+
DataType::Utf8 => {
70+
make_scalar_function(find_in_set::<Int32Type>, vec![])(args)
71+
}
72+
DataType::LargeUtf8 => {
73+
make_scalar_function(find_in_set::<Int64Type>, vec![])(args)
74+
}
75+
other => {
76+
exec_err!("Unsupported data type {other:?} for function find_in_set")
77+
}
78+
}
79+
}
80+
}
81+
82+
///Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings
83+
///A string list is a string composed of substrings separated by , characters.
84+
pub fn find_in_set<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
85+
where
86+
T::Native: OffsetSizeTrait,
87+
{
88+
if args.len() != 2 {
89+
return exec_err!(
90+
"find_in_set was called with {} arguments. It requires 2.",
91+
args.len()
92+
);
93+
}
94+
95+
let str_array: &GenericStringArray<T::Native> =
96+
as_generic_string_array::<T::Native>(&args[0])?;
97+
let str_list_array: &GenericStringArray<T::Native> =
98+
as_generic_string_array::<T::Native>(&args[1])?;
99+
100+
let result = str_array
101+
.iter()
102+
.zip(str_list_array.iter())
103+
.map(|(string, str_list)| match (string, str_list) {
104+
(Some(string), Some(str_list)) => {
105+
let mut res = 0;
106+
let str_set: Vec<&str> = str_list.split(',').collect();
107+
for (idx, str) in str_set.iter().enumerate() {
108+
if str == &string {
109+
res = idx + 1;
110+
break;
111+
}
112+
}
113+
T::Native::from_usize(res)
114+
}
115+
_ => None,
116+
})
117+
.collect::<PrimitiveArray<T>>();
118+
Ok(Arc::new(result) as ArrayRef)
119+
}

datafusion/functions/src/unicode/mod.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,27 +22,33 @@ use std::sync::Arc;
2222
use datafusion_expr::ScalarUDF;
2323

2424
mod character_length;
25+
mod find_in_set;
2526
mod left;
2627
mod lpad;
2728
mod reverse;
2829
mod right;
2930
mod rpad;
3031
mod strpos;
3132
mod substr;
33+
mod substrindex;
34+
mod translate;
3235

3336
// create UDFs
3437
make_udf_function!(
3538
character_length::CharacterLengthFunc,
3639
CHARACTER_LENGTH,
3740
character_length
3841
);
42+
make_udf_function!(find_in_set::FindInSetFunc, FIND_IN_SET, find_in_set);
3943
make_udf_function!(left::LeftFunc, LEFT, left);
4044
make_udf_function!(lpad::LPadFunc, LPAD, lpad);
4145
make_udf_function!(right::RightFunc, RIGHT, right);
4246
make_udf_function!(reverse::ReverseFunc, REVERSE, reverse);
4347
make_udf_function!(rpad::RPadFunc, RPAD, rpad);
4448
make_udf_function!(strpos::StrposFunc, STRPOS, strpos);
4549
make_udf_function!(substr::SubstrFunc, SUBSTR, substr);
50+
make_udf_function!(substrindex::SubstrIndexFunc, SUBSTR_INDEX, substr_index);
51+
make_udf_function!(translate::TranslateFunc, TRANSLATE, translate);
4652

4753
pub mod expr_fn {
4854
use datafusion_expr::Expr;
@@ -57,6 +63,11 @@ pub mod expr_fn {
5763
super::character_length().call(vec![string])
5864
}
5965

66+
#[doc = "Returns a value in the range of 1 to N if the string str is in the string list strlist consisting of N substrings"]
67+
pub fn find_in_set(string: Expr, strlist: Expr) -> Expr {
68+
super::find_in_set().call(vec![string, strlist])
69+
}
70+
6071
#[doc = "finds the position from where the `substring` matches the `string`"]
6172
pub fn instr(string: Expr, substring: Expr) -> Expr {
6273
strpos(string, substring)
@@ -111,18 +122,31 @@ pub mod expr_fn {
111122
pub fn substring(string: Expr, position: Expr, length: Expr) -> Expr {
112123
super::substr().call(vec![string, position, length])
113124
}
125+
126+
#[doc = "Returns the substring from str before count occurrences of the delimiter"]
127+
pub fn substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr {
128+
super::substr_index().call(vec![string, delimiter, count])
129+
}
130+
131+
#[doc = "replaces the characters in `from` with the counterpart in `to`"]
132+
pub fn translate(string: Expr, from: Expr, to: Expr) -> Expr {
133+
super::translate().call(vec![string, from, to])
134+
}
114135
}
115136

116137
/// Return a list of all functions in this package
117138
pub fn functions() -> Vec<Arc<ScalarUDF>> {
118139
vec![
119140
character_length(),
141+
find_in_set(),
120142
left(),
121143
lpad(),
122144
reverse(),
123145
right(),
124146
rpad(),
125147
strpos(),
126148
substr(),
149+
substr_index(),
150+
translate(),
127151
]
128152
}

0 commit comments

Comments
 (0)