26 changes: 26 additions & 0 deletions parquet/src/arrow/arrow_writer/mod.rs
@@ -80,6 +80,32 @@ mod levels;
///
/// assert_eq!(to_write, read);
/// ```
///
/// ## Memory Limiting
///
/// The nature of parquet forces buffering of an entire row group before it can be flushed
Contributor: Would it be worth suggesting to users that if they want to minimize memory overages when writing such data, they can send in smaller RecordBatches (e.g. split up via RecordBatch::slice), which gives the parquet writer more chances to check / flush?
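A minimal sketch of this suggestion, assuming the `ArrowWriter` API shown in this diff (`write`, `flush`, `in_progress_size`) and `RecordBatch::slice` from arrow; the helper name `write_in_chunks`, the `chunk_rows` parameter, and the 1_000_000 byte threshold are illustrative only:

```rust
use arrow_array::RecordBatch;
use parquet::arrow::ArrowWriter;
use parquet::errors::Result;

/// Write `batch` in smaller slices so the writer gets more chances to
/// check its buffered size and flush a row group early.
fn write_in_chunks<W: std::io::Write>(
    writer: &mut ArrowWriter<W>,
    batch: &RecordBatch,
    chunk_rows: usize,
) -> Result<()> {
    let mut offset = 0;
    while offset < batch.num_rows() {
        let len = chunk_rows.min(batch.num_rows() - offset);
        // `RecordBatch::slice` is zero-copy; each slice shares the parent's buffers
        writer.write(&batch.slice(offset, len))?;
        // Flush early if the buffered (encoded) row group has grown too large
        if writer.in_progress_size() > 1_000_000 {
            writer.flush()?;
        }
        offset += len;
    }
    Ok(())
}
```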

/// to the underlying writer. Data is buffered in its encoded form, to reduce memory usage,
/// but if writing rows containing large strings or very nested data, this may still result in
/// non-trivial memory usage.
///
/// [`ArrowWriter::in_progress_size`] can be used to track the size of the buffered row group,
/// and potentially trigger an early flush of a row group based on a memory threshold and/or
/// global memory pressure. However, users should be aware that smaller row groups will result
/// in higher metadata overheads, and may worsen compression ratios and query performance.
///
/// ```no_run
/// # use std::io::Write;
/// # use arrow_array::RecordBatch;
/// # use parquet::arrow::ArrowWriter;
/// # let mut writer: ArrowWriter<Vec<u8>> = todo!();
/// # let batch: RecordBatch = todo!();
/// writer.write(&batch).unwrap();
/// // Trigger an early flush if buffered size exceeds 1_000_000
/// if writer.in_progress_size() > 1_000_000 {
/// writer.flush().unwrap();
/// }
/// ```
///
pub struct ArrowWriter<W: Write> {
/// Underlying Parquet writer
writer: SerializedFileWriter<W>,
45 changes: 31 additions & 14 deletions parquet/src/arrow/async_writer/mod.rs
@@ -69,6 +69,29 @@ use tokio::io::{AsyncWrite, AsyncWriteExt};
/// It is implemented based on the sync writer [`ArrowWriter`] with an inner buffer.
/// The buffered data will be flushed to the writer provided by caller when the
/// buffer's threshold is exceeded.
///
/// ## Memory Limiting
///
/// The nature of parquet forces buffering of an entire row group before it can be flushed
/// to the underlying writer. This buffering may exceed the configured buffer size
/// of [`AsyncArrowWriter`]. Memory usage can be limited by prematurely flushing the row group,
/// although this will have implications for file size and query performance. See [`ArrowWriter`]
/// for more information.
Comment on lines +73 to +79

Contributor: Great to have this documented! Thanks!

Should we refer to this in instantiation methods? (try_new(_with_options))

Contributor: I agree it would help -- perhaps something like the following in try_new/try_new_with_options:

/// Please see the documentation on [`Self`] for details on memory usage.

///
/// ```no_run
/// # use tokio::fs::File;
/// # use arrow_array::RecordBatch;
/// # use parquet::arrow::AsyncArrowWriter;
/// # async fn test() {
/// let mut writer: AsyncArrowWriter<File> = todo!();
/// let batch: RecordBatch = todo!();
/// writer.write(&batch).await.unwrap();
/// // Trigger an early flush if buffered size exceeds 1_000_000
/// if writer.in_progress_size() > 1_000_000 {
/// writer.flush().await.unwrap()
/// }
/// # }
/// ```
pub struct AsyncArrowWriter<W> {
/// Underlying sync writer
sync_writer: ArrowWriter<SharedBuffer>,
@@ -86,13 +109,10 @@ pub struct AsyncArrowWriter<W> {
impl<W: AsyncWrite + Unpin + Send> AsyncArrowWriter<W> {
/// Try to create a new Async Arrow Writer.
///
/// `buffer_size` determines the number of bytes to buffer before flushing
/// to the underlying [`AsyncWrite`]
///
/// The intermediate buffer will automatically be resized if necessary
///
/// [`Self::write`] will flush this intermediate buffer if it is at least
/// half full
/// `buffer_size` determines the minimum number of bytes to buffer before flushing
/// to the underlying [`AsyncWrite`]. However, the nature of writing parquet may
/// force buffering of data in excess of this within the underlying [`ArrowWriter`].
/// See the documentation on [`ArrowWriter`] for more details
pub fn try_new(
writer: W,
arrow_schema: SchemaRef,
@@ -105,13 +125,10 @@ impl<W: AsyncWrite + Unpin + Send> AsyncArrowWriter<W> {

/// Try to create a new Async Arrow Writer with [`ArrowWriterOptions`].
///
/// `buffer_size` determines the number of bytes to buffer before flushing
/// to the underlying [`AsyncWrite`]
///
/// The intermediate buffer will automatically be resized if necessary
///
/// [`Self::write`] will flush this intermediate buffer if it is at least
/// half full
/// `buffer_size` determines the minimum number of bytes to buffer before flushing
/// to the underlying [`AsyncWrite`]. However, the nature of writing parquet may
/// force buffering of data in excess of this within the underlying [`ArrowWriter`].
/// See the documentation on [`ArrowWriter`] for more details
pub fn try_new_with_options(
writer: W,
arrow_schema: SchemaRef,
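For the instantiation methods documented above, a minimal sketch of constructing the writer, assuming the four-argument `try_new(writer, schema, buffer_size, props)` signature this version of the crate exposes; the schema, file name, and 1 MiB buffer size are illustrative only:

```rust
use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::AsyncArrowWriter;
use tokio::fs::File;

async fn create_writer() -> AsyncArrowWriter<File> {
    // Illustrative single-column schema and output path
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
    let file = File::create("data.parquet").await.expect("create file");
    // `buffer_size` (1 MiB here) is only the flush threshold for the intermediate
    // buffer; the underlying `ArrowWriter` may still hold an entire row group's
    // worth of encoded data beyond it before the row group can be written out.
    AsyncArrowWriter::try_new(file, schema, 1024 * 1024, None).expect("create writer")
}
```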