@@ -25,9 +25,15 @@ use crate::{
     datasource::{object_store::ObjectStore, PartitionedFile},
     error::Result as DataFusionResult,
     physical_plan::RecordBatchStream,
+    scalar::ScalarValue,
 };
 use arrow::{
-    datatypes::SchemaRef,
+    array::{
+        make_array, Array, ArrayData, ArrayRef, DictionaryArray, StringArray,
+        UInt8BufferBuilder, UInt8Builder,
+    },
+    buffer::Buffer,
+    datatypes::{DataType, SchemaRef, UInt8Type},
     error::{ArrowError, Result as ArrowResult},
     record_batch::RecordBatch,
 };
@@ -61,8 +67,12 @@ impl<T> FormatReaderOpener for T where
 
 /// A stream that iterates record batch by record batch, file over file.
 pub struct FileStream<F: FormatReaderOpener> {
+    /// Partitioning column names for this stream
+    partition_names: Vec<String>,
     /// An iterator over record batches of the last file returned by file_iter
     batch_iter: BatchIter,
+    /// Partitioning column values for the current batch_iter
+    partition_values: Vec<ScalarValue>,
     /// An iterator over input files
     file_iter: FileIter,
     /// The stream schema (file schema after projection)
@@ -74,6 +84,10 @@ pub struct FileStream<F: FormatReaderOpener> {
     /// is not capable of limiting the number of records in the last batch, the file
     /// stream will take care of truncating it.
     file_reader: F,
+    /// A buffer initialized to zeros that represents the key array of all partition
+    /// columns (partition columns are materialized by dictionary arrays with only one
+    /// value in the dictionary, thus all the keys are equal to zero)
+    key_buffer: Option<Buffer>,
 }
 
 impl<F: FormatReaderOpener> FileStream<F> {
@@ -83,6 +97,7 @@ impl<F: FormatReaderOpener> FileStream<F> {
         file_reader: F,
         schema: SchemaRef,
         limit: Option<usize>,
+        partition_names: Vec<String>,
     ) -> Self {
         let read_iter = files.into_iter().map(move |f| -> DataFusionResult<_> {
             object_store
@@ -93,16 +108,31 @@ impl<F: FormatReaderOpener> FileStream<F> {
         Self {
             file_iter: Box::new(read_iter),
             batch_iter: Box::new(iter::empty()),
+            partition_values: vec![],
             remain: limit,
             schema,
             file_reader,
+            partition_names,
+            key_buffer: None,
         }
     }
 
-    /// Acts as a flat_map of record batches over files.
+    /// Acts as a flat_map of record batches over files. Adds the partitioning
+    /// columns to the returned record batches.
     fn next_batch(&mut self) -> Option<ArrowResult<RecordBatch>> {
         match self.batch_iter.next() {
-            Some(batch) => Some(batch),
+            Some(Ok(batch)) => {
+                // TODO: append the partition columns here, roughly as follows
+                // (assuming `self.schema` already includes them):
+                // let mut cols = batch.columns().to_vec();
+                // let values = self.partition_values.clone();
+                // for val in &values {
+                //     cols.push(self.create_dict_array(val, batch.num_rows()));
+                // }
+                // Some(RecordBatch::try_new(self.schema.clone(), cols))
+                Some(Ok(batch))
+            }
+            Some(Err(e)) => Some(Err(e)),
             None => match self.file_iter.next() {
                 Some(Ok(f)) => {
                     self.batch_iter = (self.file_reader)(f, &self.remain);
@@ -113,6 +143,38 @@ impl<F: FormatReaderOpener> FileStream<F> {
             },
         }
     }
+
+    fn init_and_cache_buf(&mut self, len: usize) -> Buffer {
+        let mut key_buffer_builder = UInt8BufferBuilder::new(len);
+        key_buffer_builder.advance(len); // keys are all 0
+        self.key_buffer = Some(key_buffer_builder.finish());
+        self.key_buffer.as_ref().unwrap().clone()
+    }
+
+    fn create_dict_array(&mut self, val: &ScalarValue, len: usize) -> ArrayRef {
+        let value_datatype = val.get_datatype();
+
+        // build value dictionary
+        let dict_vals = val.to_array();
+
+        // build keys array
+        let key_buffer = match &self.key_buffer {
+            // reuse the cached zeroed buffer if it is large enough
+            Some(buf) if buf.len() >= len => buf.slice(buf.len() - len),
+            _ => self.init_and_cache_buf(len),
+        };
+
+        // create data type
+        let data_type =
+            DataType::Dictionary(Box::new(DataType::UInt8), Box::new(value_datatype));
+
+        // assemble pieces together
+        let mut builder = ArrayData::builder(data_type)
+            .len(len)
+            .add_buffer(key_buffer);
+        builder = builder.add_child_data(dict_vals.data().clone());
+        Arc::new(DictionaryArray::<UInt8Type>::from(builder.build().unwrap()))
+    }
 }
 
 impl<F: FormatReaderOpener> Stream for FileStream<F> {
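
Note on the technique: each partition column is materialized as a dictionary array with a single dictionary entry and an all-zero key buffer, so a constant column of any length costs one scalar plus a (cached, sliceable) key buffer rather than `len` copies of the value. The sketch below shows the same trick through the higher-level arrow-rs API; `constant_column` is a hypothetical helper, and `DictionaryArray::try_new(keys, values)` is the signature in recent arrow-rs releases, whereas this commit assembles the `ArrayData` by hand so it can reuse one cached zeroed key buffer across batches.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, DictionaryArray, StringArray, UInt8Array};
use arrow::datatypes::UInt8Type;

/// Hypothetical helper: a column where `value` repeats `len` times,
/// encoded as a single-entry dictionary with all-zero keys.
fn constant_column(value: &str, len: usize) -> ArrayRef {
    // The dictionary holds the partition value exactly once.
    let values = StringArray::from(vec![value]);
    // Every key is 0, i.e. every row points at that single entry.
    let keys = UInt8Array::from(vec![0u8; len]);
    Arc::new(DictionaryArray::<UInt8Type>::try_new(keys, Arc::new(values)).unwrap())
}

fn main() {
    let col = constant_column("2021-10-26", 4);
    assert_eq!(col.len(), 4);
    println!("{:?}", col);
}
```

A `UInt8` key type caps the dictionary at 256 entries, which is ample here since the dictionary only ever holds one value.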