@@ -458,7 +458,7 @@ impl Codec {
                 let nulls = converter.convert_columns(&[null_array])?;

                 let owned = OwnedRow {
-                    data: nulls.buffer,
+                    data: nulls.buffer.into(),
                     config: nulls.config,
                 };
                 Ok(Self::DictionaryValues(converter, owned))
@@ -496,7 +496,7 @@ impl Codec {

                 let nulls = converter.convert_columns(&nulls)?;
                 let owned = OwnedRow {
-                    data: nulls.buffer,
+                    data: nulls.buffer.into(),
                     config: nulls.config,
                 };

@@ -715,7 +715,13 @@ impl RowConverter {
             columns.iter().zip(self.fields.iter()).zip(encoders)
         {
             // We encode a column at a time to minimise dispatch overheads
-            encode_column(&mut rows, column.as_ref(), field.options, &encoder)
+            encode_column(
+                &mut rows.buffer,
+                &mut rows.offsets,
+                column.as_ref(),
+                field.options,
+                &encoder,
+            )
         }

         if cfg!(debug_assertions) {
@@ -756,6 +762,48 @@ impl RowConverter {
         unsafe { self.convert_raw(&mut rows, validate_utf8) }
     }

+    /// Returns an empty [`Rows`] with capacity for `row_capacity` rows and a
+    /// total data capacity of `data_capacity` bytes
+    ///
+    /// This can be used to buffer a selection of [`Row`]s
+    ///
+    /// ```
+    /// # use std::sync::Arc;
+    /// # use std::collections::HashSet;
+    /// # use arrow_array::cast::AsArray;
+    /// # use arrow_array::StringArray;
+    /// # use arrow_row::{Row, RowConverter, SortField};
+    /// # use arrow_schema::DataType;
+    /// #
+    /// let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();
+    /// let array = StringArray::from(vec!["hello", "world", "a", "a", "hello"]);
+    ///
+    /// // Convert to row format and deduplicate
+    /// let converted = converter.convert_columns(&[Arc::new(array)]).unwrap();
+    /// let mut distinct_rows = converter.empty_rows(3, 100);
+    /// let mut dedup: HashSet<Row> = HashSet::with_capacity(3);
+    /// converted.iter().filter(|row| dedup.insert(*row)).for_each(|row| distinct_rows.push(row));
+    ///
+    /// // Note: we could skip buffering and feed the filtered iterator directly
+    /// // into convert_rows; buffering is done here for demonstration purposes only
+    /// let distinct = converter.convert_rows(&distinct_rows).unwrap();
+    /// let values: Vec<_> = distinct[0].as_string::<i32>().iter().map(Option::unwrap).collect();
+    /// assert_eq!(&values, &["hello", "world", "a"]);
+    /// ```
+    pub fn empty_rows(&self, row_capacity: usize, data_capacity: usize) -> Rows {
+        let mut offsets = Vec::with_capacity(row_capacity.saturating_add(1));
+        offsets.push(0);
+
+        Rows {
+            offsets,
+            buffer: Vec::with_capacity(data_capacity),
+            config: RowConfig {
+                fields: self.fields.clone(),
+                validate_utf8: false,
+            },
+        }
+    }
+
     /// Convert raw bytes into [`ArrayRef`]
     ///
     /// # Safety
@@ -832,14 +880,25 @@ struct RowConfig {
 #[derive(Debug)]
 pub struct Rows {
     /// Underlying row bytes
-    buffer: Box<[u8]>,
+    buffer: Vec<u8>,
     /// Row `i` has data `&buffer[offsets[i]..offsets[i+1]]`
-    offsets: Box<[usize]>,
+    offsets: Vec<usize>,
     /// The config for these rows
     config: RowConfig,
 }

 impl Rows {
+    /// Append a [`Row`] to this [`Rows`]
+    pub fn push(&mut self, row: Row<'_>) {
+        assert!(
+            Arc::ptr_eq(&row.config.fields, &self.config.fields),
+            "row was not produced by this RowConverter"
+        );
+        self.config.validate_utf8 |= row.config.validate_utf8;
+        self.buffer.extend_from_slice(row.data);
+        self.offsets.push(self.buffer.len())
+    }
+
     pub fn row(&self, row: usize) -> Row<'_> {
         let end = self.offsets[row + 1];
         let start = self.offsets[row];
@@ -1171,66 +1230,67 @@ fn new_empty_rows(cols: &[ArrayRef], encoders: &[Encoder], config: RowConfig) ->
     let buffer = vec![0_u8; cur_offset];

     Rows {
-        buffer: buffer.into(),
-        offsets: offsets.into(),
+        buffer,
+        offsets,
         config,
     }
 }

 /// Encodes a column to the provided [`Rows`] incrementing the offsets as it progresses
 fn encode_column(
-    out: &mut Rows,
+    data: &mut [u8],
+    offsets: &mut [usize],
     column: &dyn Array,
     opts: SortOptions,
     encoder: &Encoder<'_>,
 ) {
     match encoder {
         Encoder::Stateless => {
             downcast_primitive_array! {
-                column => fixed::encode(out, column, opts),
+                column => fixed::encode(data, offsets, column, opts),
                 DataType::Null => {}
-                DataType::Boolean => fixed::encode(out, column.as_boolean(), opts),
+                DataType::Boolean => fixed::encode(data, offsets, column.as_boolean(), opts),
                 DataType::Binary => {
-                    variable::encode(out, as_generic_binary_array::<i32>(column).iter(), opts)
+                    variable::encode(data, offsets, as_generic_binary_array::<i32>(column).iter(), opts)
                 }
                 DataType::LargeBinary => {
-                    variable::encode(out, as_generic_binary_array::<i64>(column).iter(), opts)
+                    variable::encode(data, offsets, as_generic_binary_array::<i64>(column).iter(), opts)
                 }
                 DataType::Utf8 => variable::encode(
-                    out,
+                    data, offsets,
                     column.as_string::<i32>().iter().map(|x| x.map(|x| x.as_bytes())),
                     opts,
                 ),
                 DataType::LargeUtf8 => variable::encode(
-                    out,
+                    data, offsets,
                     column.as_string::<i64>()
                         .iter()
                         .map(|x| x.map(|x| x.as_bytes())),
                     opts,
                 ),
                 DataType::FixedSizeBinary(_) => {
                     let array = column.as_any().downcast_ref().unwrap();
-                    fixed::encode_fixed_size_binary(out, array, opts)
+                    fixed::encode_fixed_size_binary(data, offsets, array, opts)
                 }
                 _ => unreachable!()
             }
         }
         Encoder::Dictionary(dict) => {
             downcast_dictionary_array! {
-                column => encode_dictionary(out, column, dict, opts),
+                column => encode_dictionary(data, offsets, column, dict, opts),
                 _ => unreachable!()
             }
         }
         Encoder::DictionaryValues(values, nulls) => {
             downcast_dictionary_array! {
-                column => encode_dictionary_values(out, column, values, nulls),
+                column => encode_dictionary_values(data, offsets, column, values, nulls),
                 _ => unreachable!()
             }
         }
         Encoder::Struct(rows, null) => {
             let array = as_struct_array(column);
             let null_sentinel = null_sentinel(opts);
-            out.offsets
+            offsets
                 .iter_mut()
                 .skip(1)
                 .enumerate()
@@ -1240,15 +1300,17 @@ fn encode_column(
                     false => (*null, null_sentinel),
                 };
                 let end_offset = *offset + 1 + row.as_ref().len();
-                out.buffer[*offset] = sentinel;
-                out.buffer[*offset + 1..end_offset].copy_from_slice(row.as_ref());
+                data[*offset] = sentinel;
+                data[*offset + 1..end_offset].copy_from_slice(row.as_ref());
                 *offset = end_offset;
             })
         }
         Encoder::List(rows) => match column.data_type() {
-            DataType::List(_) => list::encode(out, rows, opts, as_list_array(column)),
+            DataType::List(_) => {
+                list::encode(data, offsets, rows, opts, as_list_array(column))
+            }
             DataType::LargeList(_) => {
-                list::encode(out, rows, opts, as_large_list_array(column))
+                list::encode(data, offsets, rows, opts, as_large_list_array(column))
             }
             _ => unreachable!(),
         },
@@ -1384,9 +1446,9 @@ mod tests {
         .unwrap();
         let rows = converter.convert_columns(&cols).unwrap();

-        assert_eq!(rows.offsets.as_ref(), &[0, 8, 16, 24, 32, 40, 48, 56]);
+        assert_eq!(rows.offsets, &[0, 8, 16, 24, 32, 40, 48, 56]);
         assert_eq!(
-            rows.buffer.as_ref(),
+            rows.buffer,
             &[
                 1, 128, 1, //
                 1, 191, 166, 102, 102, //
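
Taken together, the new `empty_rows` and `Rows::push` APIs let callers accumulate rows from several `convert_columns` calls into one growable buffer before converting back. The sketch below (not part of the commit) illustrates that pattern; the batch contents and capacity guesses are illustrative assumptions, and `data_capacity` is only a hint since `push` grows the now `Vec`-backed buffer as needed.

```rust
use std::sync::Arc;

use arrow_array::{Array, ArrayRef, StringArray};
use arrow_row::{RowConverter, SortField};
use arrow_schema::DataType;

fn main() {
    let mut converter = RowConverter::new(vec![SortField::new(DataType::Utf8)]).unwrap();

    // Two incoming batches, e.g. from a streaming source (illustrative data)
    let batches: Vec<ArrayRef> = vec![
        Arc::new(StringArray::from(vec!["x", "y"])),
        Arc::new(StringArray::from(vec!["z"])),
    ];

    // Reserve space for 3 rows and a guessed 64 bytes of row data;
    // the buffer grows automatically if the guess is too small
    let mut buffered = converter.empty_rows(3, 64);
    for batch in batches {
        let rows = converter.convert_columns(&[batch]).unwrap();
        for row in rows.iter() {
            // Copies the row bytes, so `rows` can be dropped afterwards
            buffered.push(row);
        }
    }

    // Convert the accumulated rows back into a single column
    let cols = converter.convert_rows(&buffered).unwrap();
    assert_eq!(cols[0].len(), 3);
}
```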