Skip to content

Commit fdb2d57

Browse files
authored
Improve the performance of ltrim/rtrim/btrim (#10006)
* optimize trim function
* fix: the second arg is NULL
1 parent ed37467 commit fdb2d57

File tree

6 files changed

+98
-7
lines changed

6 files changed

+98
-7
lines changed

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,3 +113,8 @@ required-features = ["datetime_expressions"]
113113
harness = false
114114
name = "substr_index"
115115
required-features = ["unicode_expressions"]
116+
117+
[[bench]]
118+
harness = false
119+
name = "ltrim"
120+
required-features = ["string_expressions"]
datafusion/functions/benches/ltrim.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
extern crate criterion;
19+
20+
use arrow::array::{ArrayRef, StringArray};
21+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
22+
use datafusion_common::ScalarValue;
23+
use datafusion_expr::ColumnarValue;
24+
use datafusion_functions::string;
25+
use std::sync::Arc;
26+
27+
fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> {
28+
let iter =
29+
std::iter::repeat(format!("{}datafusion{}", characters, characters)).take(size);
30+
let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef;
31+
vec![
32+
ColumnarValue::Array(array),
33+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(characters.to_string()))),
34+
]
35+
}
36+
37+
fn criterion_benchmark(c: &mut Criterion) {
38+
let ltrim = string::ltrim();
39+
for char in ["\"", "Header:"] {
40+
for size in [1024, 4096, 8192] {
41+
let args = create_args(size, char);
42+
c.bench_function(&format!("ltrim {}: {}", char, size), |b| {
43+
b.iter(|| black_box(ltrim.invoke(&args)))
44+
});
45+
}
46+
}
47+
}
48+
49+
criterion_group!(benches, criterion_benchmark);
50+
criterion_main!(benches);

datafusion/functions/src/string/btrim.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
2424
use datafusion_expr::TypeSignature::*;
2525
use datafusion_expr::{ColumnarValue, Volatility};
2626
use datafusion_expr::{ScalarUDFImpl, Signature};
27+
use datafusion_physical_expr::functions::Hint;
2728

2829
use crate::string::common::*;
2930
use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -72,8 +73,14 @@ impl ScalarUDFImpl for BTrimFunc {
7273

7374
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
7475
match args[0].data_type() {
75-
DataType::Utf8 => make_scalar_function(btrim::<i32>, vec![])(args),
76-
DataType::LargeUtf8 => make_scalar_function(btrim::<i64>, vec![])(args),
76+
DataType::Utf8 => make_scalar_function(
77+
btrim::<i32>,
78+
vec![Hint::Pad, Hint::AcceptsSingular],
79+
)(args),
80+
DataType::LargeUtf8 => make_scalar_function(
81+
btrim::<i64>,
82+
vec![Hint::Pad, Hint::AcceptsSingular],
83+
)(args),
7784
other => exec_err!("Unsupported data type {other:?} for function btrim"),
7885
}
7986
}

datafusion/functions/src/string/common.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
use std::fmt::{Display, Formatter};
1919
use std::sync::Arc;
2020

21-
use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait};
21+
use arrow::array::{
22+
new_null_array, Array, ArrayRef, GenericStringArray, OffsetSizeTrait,
23+
};
2224
use arrow::datatypes::DataType;
2325

2426
use datafusion_common::cast::as_generic_string_array;
@@ -78,6 +80,19 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
7880
2 => {
7981
let characters_array = as_generic_string_array::<T>(&args[1])?;
8082

83+
if characters_array.len() == 1 {
84+
if characters_array.is_null(0) {
85+
return Ok(new_null_array(args[0].data_type(), args[0].len()));
86+
}
87+
88+
let characters = characters_array.value(0);
89+
let result = string_array
90+
.iter()
91+
.map(|item| item.map(|string| func(string, characters)))
92+
.collect::<GenericStringArray<T>>();
93+
return Ok(Arc::new(result) as ArrayRef);
94+
}
95+
8196
let result = string_array
8297
.iter()
8398
.zip(characters_array.iter())

datafusion/functions/src/string/ltrim.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
2424
use datafusion_expr::TypeSignature::*;
2525
use datafusion_expr::{ColumnarValue, Volatility};
2626
use datafusion_expr::{ScalarUDFImpl, Signature};
27+
use datafusion_physical_expr::functions::Hint;
2728

2829
use crate::string::common::*;
2930
use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -70,8 +71,14 @@ impl ScalarUDFImpl for LtrimFunc {
7071

7172
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
7273
match args[0].data_type() {
73-
DataType::Utf8 => make_scalar_function(ltrim::<i32>, vec![])(args),
74-
DataType::LargeUtf8 => make_scalar_function(ltrim::<i64>, vec![])(args),
74+
DataType::Utf8 => make_scalar_function(
75+
ltrim::<i32>,
76+
vec![Hint::Pad, Hint::AcceptsSingular],
77+
)(args),
78+
DataType::LargeUtf8 => make_scalar_function(
79+
ltrim::<i64>,
80+
vec![Hint::Pad, Hint::AcceptsSingular],
81+
)(args),
7582
other => exec_err!("Unsupported data type {other:?} for function ltrim"),
7683
}
7784
}

datafusion/functions/src/string/rtrim.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use datafusion_common::{exec_err, Result};
2424
use datafusion_expr::TypeSignature::*;
2525
use datafusion_expr::{ColumnarValue, Volatility};
2626
use datafusion_expr::{ScalarUDFImpl, Signature};
27+
use datafusion_physical_expr::functions::Hint;
2728

2829
use crate::string::common::*;
2930
use crate::utils::{make_scalar_function, utf8_to_str_type};
@@ -70,8 +71,14 @@ impl ScalarUDFImpl for RtrimFunc {
7071

7172
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
7273
match args[0].data_type() {
73-
DataType::Utf8 => make_scalar_function(rtrim::<i32>, vec![])(args),
74-
DataType::LargeUtf8 => make_scalar_function(rtrim::<i64>, vec![])(args),
74+
DataType::Utf8 => make_scalar_function(
75+
rtrim::<i32>,
76+
vec![Hint::Pad, Hint::AcceptsSingular],
77+
)(args),
78+
DataType::LargeUtf8 => make_scalar_function(
79+
rtrim::<i64>,
80+
vec![Hint::Pad, Hint::AcceptsSingular],
81+
)(args),
7582
other => exec_err!("Unsupported data type {other:?} for function rtrim"),
7683
}
7784
}

0 commit comments

Comments
 (0)