15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
- //! Parquet format abstractions
18
+ //! [`ParquetFormat`]: Parquet [`FileFormat`] abstractions
19
19
20
20
use arrow_array:: RecordBatch ;
21
21
use async_trait:: async_trait;
@@ -75,6 +75,17 @@ use crate::physical_plan::{
75
75
Statistics ,
76
76
} ;
77
77
78
+ /// Size of the buffer, in bytes, for [`AsyncArrowWriter`] (10 MiB).
79
+ const PARQUET_WRITER_BUFFER_SIZE : usize = 10485760 ;
80
+
81
+ /// Initial writing buffer size in bytes (1 MiB). Note this is just a size hint for efficiency. It
82
+ /// will grow beyond the set value if needed.
83
+ const INITIAL_BUFFER_BYTES : usize = 1048576 ;
84
+
85
+ /// When writing Parquet files in parallel, if the buffered Parquet data exceeds
86
+ /// this size (in bytes), it is flushed to the object store.
87
+ const BUFFER_FLUSH_BYTES : usize = 1024000 ;
88
+
78
89
/// The Apache Parquet `FileFormat` implementation
79
90
///
80
91
/// Note it is recommended these are instead configured on the [`ConfigOptions`]
@@ -680,7 +691,7 @@ impl ParquetSink {
680
691
let writer = AsyncArrowWriter :: try_new (
681
692
multipart_writer,
682
693
self . get_writer_schema ( ) ,
683
- 10485760 ,
694
+ PARQUET_WRITER_BUFFER_SIZE ,
684
695
Some ( parquet_props) ,
685
696
) ?;
686
697
Ok ( writer)
@@ -1004,7 +1015,7 @@ async fn concatenate_parallel_row_groups(
1004
1015
writer_props : Arc < WriterProperties > ,
1005
1016
mut object_store_writer : AbortableWrite < Box < dyn AsyncWrite + Send + Unpin > > ,
1006
1017
) -> Result < usize > {
1007
- let merged_buff = SharedBuffer :: new ( 1048576 ) ;
1018
+ let merged_buff = SharedBuffer :: new ( INITIAL_BUFFER_BYTES ) ;
1008
1019
1009
1020
let schema_desc = arrow_to_parquet_schema ( schema. as_ref ( ) ) ?;
1010
1021
let mut parquet_writer = SerializedFileWriter :: new (
@@ -1025,7 +1036,7 @@ async fn concatenate_parallel_row_groups(
1025
1036
for chunk in serialized_columns {
1026
1037
chunk. append_to_row_group ( & mut rg_out) ?;
1027
1038
let mut buff_to_flush = merged_buff. buffer . try_lock ( ) . unwrap ( ) ;
1028
- if buff_to_flush. len ( ) > 1024000 {
1039
+ if buff_to_flush. len ( ) > BUFFER_FLUSH_BYTES {
1029
1040
object_store_writer
1030
1041
. write_all ( buff_to_flush. as_slice ( ) )
1031
1042
. await ?;
0 commit comments