
Commit 4f17f83

Improve module documentation for parquet crate

1 parent 9350e9d commit 4f17f83

File tree

3 files changed: 88 additions & 73 deletions

- parquet/src/arrow/async_reader.rs
- parquet/src/arrow/mod.rs
- parquet/src/lib.rs

parquet/src/arrow/async_reader.rs

Lines changed: 58 additions & 57 deletions
@@ -15,8 +15,64 @@
 // specific language governing permissions and limitations
 // under the License.

-//! Contains asynchronous APIs for reading parquet files into
-//! arrow [`RecordBatch`]
+//! Provides `async` API for reading parquet files as
+//! [`RecordBatch`]es
+//!
+//! ```
+//! # #[tokio::main(flavor="current_thread")]
+//! # async fn main() {
+//! #
+//! use arrow::record_batch::RecordBatch;
+//! use arrow::util::pretty::pretty_format_batches;
+//! use futures::TryStreamExt;
+//! use tokio::fs::File;
+//!
+//! use parquet::arrow::ParquetRecordBatchStreamBuilder;
+//!
+//! # fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
+//! #     let formatted = pretty_format_batches(batches).unwrap().to_string();
+//! #     let actual_lines: Vec<_> = formatted.trim().lines().collect();
+//! #     assert_eq!(
+//! #         &actual_lines, expected_lines,
+//! #         "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
+//! #         expected_lines, actual_lines
+//! #     );
+//! # }
+//!
+//! let testdata = arrow::util::test_util::parquet_test_data();
+//! let path = format!("{}/alltypes_plain.parquet", testdata);
+//! let file = tokio::fs::File::open(path).await.unwrap();
+//!
+//! let builder = ParquetRecordBatchStreamBuilder::new(file)
+//!     .await
+//!     .unwrap()
+//!     .with_projection(vec![1, 2, 6])
+//!     .with_batch_size(3);
+//!
+//! let stream = builder.build().unwrap();
+//!
+//! let results = stream.try_collect::<Vec<_>>().await.unwrap();
+//! assert_eq!(results.len(), 3);
+//!
+//! assert_batches_eq(
+//!     &results,
+//!     &[
+//!         "+----------+-------------+-----------+",
+//!         "| bool_col | tinyint_col | float_col |",
+//!         "+----------+-------------+-----------+",
+//!         "| true     | 0           | 0         |",
+//!         "| false    | 1           | 1.1       |",
+//!         "| true     | 0           | 0         |",
+//!         "| false    | 1           | 1.1       |",
+//!         "| true     | 0           | 0         |",
+//!         "| false    | 1           | 1.1       |",
+//!         "| true     | 0           | 0         |",
+//!         "| false    | 1           | 1.1       |",
+//!         "+----------+-------------+-----------+",
+//!     ],
+//! );
+//! # }
+//! ```

 use std::collections::VecDeque;
 use std::fmt::Formatter;
@@ -425,58 +481,3 @@ impl PageIterator for ColumnChunkIterator {
         Ok(self.column_schema.clone())
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use arrow::util::pretty::pretty_format_batches;
-    use futures::TryStreamExt;
-    use tokio::fs::File;
-
-    use super::*;
-
-    fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
-        let formatted = pretty_format_batches(batches).unwrap().to_string();
-        let actual_lines: Vec<_> = formatted.trim().lines().collect();
-        assert_eq!(
-            &actual_lines, expected_lines,
-            "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
-            expected_lines, actual_lines
-        );
-    }
-
-    #[tokio::test]
-    async fn test_parquet_stream() {
-        let testdata = arrow::util::test_util::parquet_test_data();
-        let path = format!("{}/alltypes_plain.parquet", testdata);
-        let file = File::open(path).await.unwrap();
-
-        let builder = ParquetRecordBatchStreamBuilder::new(file)
-            .await
-            .unwrap()
-            .with_projection(vec![1, 2, 6])
-            .with_batch_size(3);
-
-        let stream = builder.build().unwrap();
-
-        let results = stream.try_collect::<Vec<_>>().await.unwrap();
-        assert_eq!(results.len(), 3);
-
-        assert_batches_eq(
-            &results,
-            &[
-                "+----------+-------------+-----------+",
-                "| bool_col | tinyint_col | float_col |",
-                "+----------+-------------+-----------+",
-                "| true     | 0           | 0         |",
-                "| false    | 1           | 1.1       |",
-                "| true     | 0           | 0         |",
-                "| false    | 1           | 1.1       |",
-                "| true     | 0           | 0         |",
-                "| false    | 1           | 1.1       |",
-                "| true     | 0           | 0         |",
-                "| false    | 1           | 1.1       |",
-                "+----------+-------------+-----------+",
-            ],
-        );
-    }
-}
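
The new module-level example above collects every batch with `try_collect` before asserting. Not part of this commit, but for context: the same builder also supports incremental consumption, which keeps memory bounded on large files. A minimal sketch, assuming only the `ParquetRecordBatchStreamBuilder` API shown in the diff above plus `futures::TryStreamExt::try_next`; the batch size of 1024 is an arbitrary illustration:

```rust
use futures::TryStreamExt;
use parquet::arrow::ParquetRecordBatchStreamBuilder;

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let testdata = arrow::util::test_util::parquet_test_data();
    let path = format!("{}/alltypes_plain.parquet", testdata);
    let file = tokio::fs::File::open(path).await.unwrap();

    let mut stream = ParquetRecordBatchStreamBuilder::new(file)
        .await
        .unwrap()
        .with_batch_size(1024)
        .build()
        .unwrap();

    // Pull batches one at a time; unlike try_collect, only one
    // RecordBatch is resident in memory at once.
    while let Some(batch) = stream.try_next().await.unwrap() {
        println!("read {} rows", batch.num_rows());
    }
}
```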

parquet/src/arrow/mod.rs

Lines changed: 15 additions & 16 deletions
@@ -15,42 +15,39 @@
 // specific language governing permissions and limitations
 // under the License.

+//! Provides API for reading/writing Arrow
+//! [RecordBatch](arrow::record_batch::RecordBatch)es and
+//! [Array](arrow::array::Arrays) to/from Parquet Files.
+//!
 //! [Apache Arrow](http://arrow.apache.org/) is a cross-language development platform for
 //! in-memory data.
 //!
-//! This mod provides API for converting between arrow and parquet.
-//!
 //!# Example of writing Arrow record batch to Parquet file
 //!
 //!```rust
-//! use arrow::array::Int32Array;
-//! use arrow::datatypes::{DataType, Field, Schema};
+//! use arrow::array::{Int32Array, ArrayRef};
 //! use arrow::record_batch::RecordBatch;
 //! use parquet::arrow::arrow_writer::ArrowWriter;
 //! use parquet::file::properties::WriterProperties;
 //! use std::fs::File;
 //! use std::sync::Arc;
 //! let ids = Int32Array::from(vec![1, 2, 3, 4]);
 //! let vals = Int32Array::from(vec![5, 6, 7, 8]);
-//! let schema = Arc::new(Schema::new(vec![
-//!     Field::new("id", DataType::Int32, false),
-//!     Field::new("val", DataType::Int32, false),
-//! ]));
+//! let batch = RecordBatch::try_from_iter(vec![
+//!     ("id", Arc::new(ids) as ArrayRef),
+//!     ("val", Arc::new(vals) as ArrayRef),
+//! ]).unwrap();
 //!
 //! let file = File::create("data.parquet").unwrap();
 //!
-//! let batch =
-//!     RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(ids), Arc::new(vals)]).unwrap();
-//! let batches = vec![batch];
-//!
 //! // Default writer properties
 //! let props = WriterProperties::builder().build();
 //!
-//! let mut writer = ArrowWriter::try_new(file, Arc::clone(&schema), Some(props)).unwrap();
+//! let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props)).unwrap();
 //!
-//! for batch in batches {
-//!     writer.write(&batch).expect("Writing batch");
-//! }
+//! writer.write(&batch).expect("Writing batch");
+//!
+//! // writer must be closed to write footer
 //! writer.close().unwrap();
 //! ```
 //!
@@ -134,6 +131,8 @@ experimental_mod!(schema);
 pub use self::arrow_reader::ArrowReader;
 pub use self::arrow_reader::ParquetFileArrowReader;
 pub use self::arrow_writer::ArrowWriter;
+#[cfg(feature = "async")]
+pub use self::async_reader::ParquetRecordBatchStreamBuilder;

 pub use self::schema::{
     arrow_to_parquet_schema, parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns,
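
The rewritten example writes `data.parquet` and stops at `writer.close()`. As a companion (not part of this commit), reading that file back through the synchronous Arrow reader re-exported in the hunk above might look like the following sketch; it assumes the `SerializedFileReader` and `ParquetFileArrowReader::get_record_reader` APIs the crate exposes at this point:

```rust
use std::fs::File;
use std::sync::Arc;

use parquet::arrow::{ArrowReader, ParquetFileArrowReader};
use parquet::file::reader::SerializedFileReader;

fn main() {
    // Open the file written by the example above.
    let file = File::open("data.parquet").unwrap();
    let file_reader = SerializedFileReader::new(file).unwrap();
    let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(file_reader));

    // Iterate the file back as Arrow RecordBatches of up to 1024 rows.
    let record_batch_reader = arrow_reader.get_record_reader(1024).unwrap();
    for batch in record_batch_reader {
        let batch = batch.unwrap();
        println!("read {} rows", batch.num_rows());
    }
}
```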

parquet/src/lib.rs

Lines changed: 15 additions & 0 deletions
@@ -15,6 +15,21 @@
 // specific language governing permissions and limitations
 // under the License.

+//! This crate contains the official Native Rust implementation of
+//! [Apache Parquet](https://parquet.apache.org/), part of
+//! the [Apache Arrow](https://arrow.apache.org/) project.
+//!
+//! # Getting Started
+//! Start with some examples:
+//!
+//! 1. [mod@file] for reading and writing parquet files using the
+//!    [ColumnReader](column::reader::ColumnReader) API.
+//!
+//! 2. [arrow] for reading and writing parquet files to Arrow
+//!    `RecordBatch`es
+//!
+//! 3. [arrow::async_reader] for `async` reading and writing parquet
+//!    files to Arrow `RecordBatch`es (requires the `async` feature).
 #![allow(incomplete_features)]
 #![allow(dead_code)]
 #![allow(non_camel_case_types)]
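
The new crate docs point to [mod@file] for the lower-level, non-Arrow route. A small sketch of that route (not part of this commit), using the row-oriented `FileReader::get_row_iter` rather than the `ColumnReader` the docs mention; the `data.parquet` path is the hypothetical file produced by the writer example earlier on this page:

```rust
use std::fs::File;
use std::path::Path;

use parquet::file::reader::{FileReader, SerializedFileReader};

fn main() {
    // Any parquet file on disk; here, the one written above.
    let path = Path::new("data.parquet");
    let file = File::open(path).unwrap();
    let reader = SerializedFileReader::new(file).unwrap();

    // Row-based access from the `file` module, without going through Arrow.
    // Passing None reads all columns rather than a projected subset.
    let row_iter = reader.get_row_iter(None).unwrap();
    for row in row_iter {
        println!("{}", row);
    }
}
```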

0 commit comments
