Undeprecate ArrowWriter::into_serialized_writer and add docs (#8621)

alamb · web-flow · commit 5a384f4c3ccd · 2025-10-16T12:11:56.000-04:00
# Which issue does this PR close? We generally require a GitHub issue to be filed for all bug fixes and enhancements and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax. - Related to #7835 # Rationale for this change While testing the arrow 57 upgrade in DataFusion I found a few things that need to be fixed in parquet-rs. - apache/datafusion#17888 One was that the method `ArrowWriter::into_serialized_writer` was deprecated (which I know I suggested in #8389 🤦). However, when testing it turns out that the constructor of `SerializedFileWriter` does a lot of work (like creating the parquet schema from the arrow schema and messing with metadata) https://github.com/apache/arrow-rs/blob/c4f0fc12199df696620c73d62523c8eef5743bf2/parquet/src/arrow/arrow_writer/mod.rs#L230-L263 Creating an `ArrowRowGroupWriterFactory` directly would involve a bunch of code duplication. # What changes are included in this PR? So let's not deprecate this method for now and instead add some additional docs to guide people to the right place. # Are these changes tested? I tested manually upstream # Are there any user-facing changes? If there are user-facing changes then we may require documentation to be updated before approving the PR. If there are any breaking changes to public APIs, please call them out.
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
@@ -450,11 +450,11 @@ impl<W: Write + Send> ArrowWriter<W> {
     }
 
     /// Converts this writer into a lower-level [`SerializedFileWriter`] and [`ArrowRowGroupWriterFactory`].
-    /// This can be useful to provide more control over how files are written.
-    #[deprecated(
-        since = "57.0.0",
-        note = "Construct a `SerializedFileWriter` and `ArrowRowGroupWriterFactory` directly instead"
-    )]
+    ///
+    /// Flushes any outstanding data before returning.
+    ///
+    /// This can be useful to provide more control over how files are written, for example
+    /// to write columns in parallel. See the example on [`ArrowColumnWriter`].
     pub fn into_serialized_writer(
         mut self,
     ) -> Result<(SerializedFileWriter<W>, ArrowRowGroupWriterFactory)> {
@@ -872,6 +872,12 @@ impl ArrowColumnWriter {
 }
 
 /// Encodes [`RecordBatch`] to a parquet row group
+///
+/// Note: this structure is created internally by [`ArrowRowGroupWriterFactory`],
+/// but it is not exposed publicly.
+///
+/// See the example on [`ArrowColumnWriter`] for how to encode columns in parallel.
+#[derive(Debug)]
 struct ArrowRowGroupWriter {
     writers: Vec<ArrowColumnWriter>,
     schema: SchemaRef,
@@ -907,6 +913,10 @@ impl ArrowRowGroupWriter {
 }
 
 /// Factory that creates new column writers for each row group in the Parquet file.
+///
+/// You can create this structure via [`ArrowWriter::into_serialized_writer`].
+/// See the example on [`ArrowColumnWriter`] for how to encode columns in parallel.
+#[derive(Debug)]
 pub struct ArrowRowGroupWriterFactory {
     schema: SchemaDescPtr,
     arrow_schema: SchemaRef,
@@ -937,7 +947,7 @@ impl ArrowRowGroupWriterFactory {
         Ok(ArrowRowGroupWriter::new(writers, &self.arrow_schema))
     }
 
-    /// Create column writers for a new row group.
+    /// Create column writers for a new row group, with the given row group index
     pub fn create_column_writers(&self, row_group_index: usize) -> Result<Vec<ArrowColumnWriter>> {
         let mut writers = Vec::with_capacity(self.arrow_schema.fields.len());
         let mut leaves = self.schema.columns().iter();