@@ -21,7 +21,7 @@ use std::sync::Arc;
2121
2222use arrow:: {
2323 array:: * ,
24- buffer:: { Buffer , MutableBuffer } ,
24+ buffer:: MutableBuffer ,
2525 compute:: kernels:: substring:: { substring as arrow_substring, substring_by_char} ,
2626 datatypes:: { DataType , Int32Type } ,
2727} ;
@@ -87,43 +87,6 @@ pub fn substring(array: &dyn Array, start: i64, length: u64) -> Result<ArrayRef,
8787 }
8888}
8989
90- /// Returns an ArrayRef with a substring starting from `start` and length.
91- ///
92- /// # Preconditions
93- ///
94- /// - `start` can be negative, in which case the start counts from the end of the string.
95- /// - `array` must be either [`StringArray`] or [`LargeStringArray`].
96- ///
97- /// Note: this is different from arrow-rs `substring` kernel in that both `start` and `length` are
98- /// `Int32Array` here.
99- pub fn substring_with_array (
100- array : & dyn Array ,
101- start : & Int32Array ,
102- length : & Int32Array ,
103- ) -> ArrayRef {
104- match array. data_type ( ) {
105- DataType :: LargeUtf8 => generic_substring (
106- array
107- . as_any ( )
108- . downcast_ref :: < LargeStringArray > ( )
109- . expect ( "A large string is expected" ) ,
110- start,
111- length,
112- |i| i as i64 ,
113- ) ,
114- DataType :: Utf8 => generic_substring (
115- array
116- . as_any ( )
117- . downcast_ref :: < StringArray > ( )
118- . expect ( "A string is expected" ) ,
119- start,
120- length,
121- |i| i,
122- ) ,
123- _ => panic ! ( "substring does not support type {:?}" , array. data_type( ) ) ,
124- }
125- }
126-
12790fn generic_string_space < OffsetSize : OffsetSizeTrait > ( length : & Int32Array ) -> ArrayRef {
12891 let array_len = length. len ( ) ;
12992 let mut offsets = MutableBuffer :: new ( ( array_len + 1 ) * std:: mem:: size_of :: < OffsetSize > ( ) ) ;
@@ -163,81 +126,3 @@ fn generic_string_space<OffsetSize: OffsetSizeTrait>(length: &Int32Array) -> Arr
163126 } ;
164127 make_array ( data)
165128}
166-
167- fn generic_substring < OffsetSize : OffsetSizeTrait , F > (
168- array : & GenericStringArray < OffsetSize > ,
169- start : & Int32Array ,
170- length : & Int32Array ,
171- f : F ,
172- ) -> ArrayRef
173- where
174- F : Fn ( i32 ) -> OffsetSize ,
175- {
176- assert_eq ! ( array. len( ) , start. len( ) ) ;
177- assert_eq ! ( array. len( ) , length. len( ) ) ;
178-
179- // compute current offsets
180- let offsets = array. to_data ( ) . buffers ( ) [ 0 ] . clone ( ) ;
181- let offsets: & [ OffsetSize ] = offsets. typed_data :: < OffsetSize > ( ) ;
182-
183- // compute null bitmap (copy)
184- let null_bit_buffer = array. to_data ( ) . nulls ( ) . map ( |b| b. buffer ( ) . clone ( ) ) ;
185-
186- // Gets slices of start and length arrays to access them directly for performance.
187- let start_data = start. to_data ( ) ;
188- let length_data = length. to_data ( ) ;
189- let starts = start_data. buffers ( ) [ 0 ] . typed_data :: < i32 > ( ) ;
190- let lengths = length_data. buffers ( ) [ 0 ] . typed_data :: < i32 > ( ) ;
191-
192- // compute values
193- let array_data = array. to_data ( ) ;
194- let values = & array_data. buffers ( ) [ 1 ] ;
195- let data = values. as_slice ( ) ;
196-
197- // we have no way to estimate how much this will be.
198- let mut new_values = MutableBuffer :: new ( 0 ) ;
199- let mut new_offsets: Vec < OffsetSize > = Vec :: with_capacity ( array. len ( ) + 1 ) ;
200-
201- let mut length_so_far = OffsetSize :: zero ( ) ;
202- new_offsets. push ( length_so_far) ;
203- ( 0 ..array. len ( ) ) . for_each ( |i| {
204- // the length of this entry
205- let length_i: OffsetSize = offsets[ i + 1 ] - offsets[ i] ;
206- // compute where we should start slicing this entry
207- let start_pos: OffsetSize = f ( starts[ i] ) ;
208-
209- let start = offsets[ i]
210- + if start_pos >= OffsetSize :: zero ( ) {
211- start_pos
212- } else {
213- length_i + start_pos
214- } ;
215-
216- let start = start. clamp ( offsets[ i] , offsets[ i + 1 ] ) ;
217- // compute the length of the slice
218- let slice_length: OffsetSize = f ( lengths[ i] . max ( 0 ) ) . min ( offsets[ i + 1 ] - start) ;
219-
220- length_so_far += slice_length;
221-
222- new_offsets. push ( length_so_far) ;
223-
224- // we need usize for ranges
225- let start = start. to_usize ( ) . unwrap ( ) ;
226- let slice_length = slice_length. to_usize ( ) . unwrap ( ) ;
227-
228- new_values. extend_from_slice ( & data[ start..start + slice_length] ) ;
229- } ) ;
230-
231- let data = unsafe {
232- ArrayData :: new_unchecked (
233- GenericStringArray :: < OffsetSize > :: DATA_TYPE ,
234- array. len ( ) ,
235- None ,
236- null_bit_buffer,
237- 0 ,
238- vec ! [ Buffer :: from_slice_ref( & new_offsets) , new_values. into( ) ] ,
239- vec ! [ ] ,
240- )
241- } ;
242- make_array ( data)
243- }
0 commit comments