From 7c6c1b49571d7d0d9765d4c690e5f7cbecada0e3 Mon Sep 17 00:00:00 2001
From: Axect
Date: Fri, 4 Nov 2022 09:59:54 +0900
Subject: [PATCH 1/3] CHGE: Add compression options to write_parquet

---
 src/fuga/mod.rs            |  3 +++
 src/prelude/mod.rs         | 12 +++++++++++-
 src/prelude/simpler.rs     | 23 +++++++++++++++++++++++
 src/structure/dataframe.rs |  6 +++---
 4 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/src/fuga/mod.rs b/src/fuga/mod.rs
index da7c6afb..b7b6d769 100644
--- a/src/fuga/mod.rs
+++ b/src/fuga/mod.rs
@@ -175,3 +175,6 @@ pub use crate::structure::matrix::{
 pub use crate::structure::dataframe::DType::*;
 pub use crate::structure::ad::AD::*;
 pub use crate::numerical::spline::SlopeMethod::{Akima, Quadratic};
+
+#[cfg(feature="parquet")]
+pub use arrow2::io::parquet::write::CompressionOptions;
diff --git a/src/prelude/mod.rs b/src/prelude/mod.rs
index 5d032541..b0e3e21b 100644
--- a/src/prelude/mod.rs
+++ b/src/prelude/mod.rs
@@ -129,9 +129,16 @@ pub use crate::structure::{
     },
     polynomial::{Polynomial,poly,Calculus,lagrange_polynomial,legendre_polynomial},
     vector::*,
-    dataframe::*,
+    dataframe::{
+        DataFrame, DType, DTypeArray, DTypeValue, Series, Scalar, TypedScalar, TypedVector
+    },
     //complex::C64,
 };
+#[cfg(feature="csv")]
+pub use crate::structure::dataframe::WithCSV;
+
+#[cfg(feature="nc")]
+pub use crate::structure::dataframe::WithNetCDF;
 
 pub use simpler::{solve, SimplerLinearAlgebra};
 
@@ -156,3 +163,6 @@ pub use crate::numerical::{
 };
 
 pub use simpler::{eigen, integrate, chebyshev_polynomial, cubic_hermite_spline};
+
+#[cfg(feature="parquet")]
+pub use simpler::SimpleParquet;
diff --git a/src/prelude/simpler.rs b/src/prelude/simpler.rs
index 6abb223e..98716b4a 100644
--- a/src/prelude/simpler.rs
+++ b/src/prelude/simpler.rs
@@ -1,3 +1,5 @@
+use std::error::Error;
+use arrow2::io::parquet::write::CompressionOptions;
 use crate::numerical::{
     eigen,
     eigen::{Eigen, EigenMethod::Jacobi},
@@ -6,9 +8,12 @@ use crate::numerical::{
     spline,
     spline::{CubicHermiteSpline, SlopeMethod::Quadratic},
 };
+use crate::prelude::DataFrame;
 use crate::structure::matrix::{self, Matrix};
 use crate::structure::polynomial;
 use crate::traits::math::{Norm, Normed};
+#[cfg(feature="parquet")]
+use crate::structure::dataframe::WithParquet;
 
 /// Simple Norm
 pub trait SimpleNorm: Normed {
@@ -142,4 +147,22 @@ pub fn chebyshev_polynomial(n: usize) -> polynomial::Polynomial {
 
 pub fn cubic_hermite_spline(node_x: &[f64], node_y: &[f64]) -> CubicHermiteSpline {
     spline::cubic_hermite_spline(node_x, node_y, Quadratic)
 }
+
+/// Simple handle parquet
+#[cfg(feature="parquet")]
+pub trait SimpleParquet: Sized {
+    fn write_parquet(&self, path: &str) -> Result<(), Box<dyn Error>>;
+    fn read_parquet(path: &str) -> Result<Self, Box<dyn Error>>;
+}
+
+#[cfg(feature="parquet")]
+impl SimpleParquet for DataFrame {
+    fn write_parquet(&self, path: &str) -> Result<(), Box<dyn Error>> {
+        WithParquet::write_parquet(self, path, CompressionOptions::Uncompressed)
+    }
+
+    fn read_parquet(path: &str) -> Result<Self, Box<dyn Error>> {
+        WithParquet::read_parquet(path)
+    }
+}
\ No newline at end of file
diff --git a/src/structure/dataframe.rs b/src/structure/dataframe.rs
index 2ff7658b..61d13a5c 100644
--- a/src/structure/dataframe.rs
+++ b/src/structure/dataframe.rs
@@ -1834,7 +1834,7 @@ impl WithNetCDF for DataFrame {
 #[cfg(feature="parquet")]
 pub trait WithParquet {
-    fn write_parquet(&self, file_path: &str) -> Result<(), Box<dyn Error>>;
+    fn write_parquet(&self, file_path: &str, compression: CompressionOptions) -> Result<(), Box<dyn Error>>;
     fn read_parquet(file_path: &str) -> Result<Self, Box<dyn Error>> where Self: Sized;
     // fn read_parquet_by_header(file_path: &str, header: Vec<&str>) -> Result<Self, Box<dyn Error>> where Self: Sized;
 }
 
@@ -1842,7 +1842,7 @@ pub trait WithParquet {
 #[cfg(feature="parquet")]
 impl WithParquet for DataFrame {
     /// Write DataFrame to parquet
-    fn write_parquet(&self, file_path: &str) -> Result<(), Box<dyn Error>> {
+    fn write_parquet(&self, file_path: &str, compression: CompressionOptions) -> Result<(), Box<dyn Error>> {
         let file = std::fs::File::create(file_path)?;
 
         let mut schema_vec = vec![];
@@ -1864,7 +1864,7 @@ impl WithParquet for DataFrame {
         let encodings = (0 .. l).map(|_| vec![Encoding::Plain]).collect::<Vec<_>>();
         let options = WriteOptions {
             write_statistics: true,
-            compression: CompressionOptions::Snappy,
+            compression,
             version: Version::V2,
         };
 

From 40a85a4171cae777186abb4ca910d5db8d7bc911 Mon Sep 17 00:00:00 2001
From: Axect
Date: Fri, 4 Nov 2022 10:01:55 +0900
Subject: [PATCH 2/3] DOCS: Add compression options to example

---
 src/structure/dataframe.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/structure/dataframe.rs b/src/structure/dataframe.rs
index 61d13a5c..6253a592 100644
--- a/src/structure/dataframe.rs
+++ b/src/structure/dataframe.rs
@@ -242,7 +242,7 @@
 //! df.push("a", Series::new(vec!['x', 'y', 'z']));
 //! df.push("b", Series::new(vec![0, 1, 2]));
 //! df.push("c", Series::new(c!(0.1, 0.2, 0.3)));
-//! df.write_parquet("example_data/doc_pq.parquet")?;
+//! df.write_parquet("example_data/doc_pq.parquet", CompressionOptions::Uncompressed)?;
 //!
 //! // Read parquet
 //! let mut dg = DataFrame::read_parquet("example_data/doc_pq.parquet")?;
@@ -1832,6 +1832,7 @@ impl WithNetCDF for DataFrame {
     }
 }
+/// To handle parquet format
 #[cfg(feature="parquet")]
 pub trait WithParquet {
     fn write_parquet(&self, file_path: &str, compression: CompressionOptions) -> Result<(), Box<dyn Error>>;
     fn read_parquet(file_path: &str) -> Result<Self, Box<dyn Error>> where Self: Sized;

From c6adc5d5403964c84a18dd4d4231a2ca12ac2f45 Mon Sep 17 00:00:00 2001
From: Axect
Date: Fri, 4 Nov 2022 10:15:57 +0900
Subject: [PATCH 3/3] RLSE: Ver 0.32.1

* `CompressionOptions` for `write_parquet`
---
 .gitignore         |  1 +
 Cargo.toml         |  2 +-
 RELEASES.md        |  6 ++++++
 src/fuga/mod.rs    | 44 +++++++++++++++++++++++++++++++++++++++++++-
 src/prelude/mod.rs | 42 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 93 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 17bb5752..15b0ee6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ true/
 *.parquet
 example_data/*.csv
 example_data/*.nc
+example_data/*.parquet
 
 *.mm_profdata
 src/bin/*
diff --git a/Cargo.toml b/Cargo.toml
index 58c80617..0cd72060 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "peroxide"
-version = "0.32.0"
+version = "0.32.1"
 authors = ["axect "]
 edition = "2018"
 description = "Rust comprehensive scientific computation library contains linear algebra, numerical analysis, statistics and machine learning tools with farmiliar syntax"
diff --git a/RELEASES.md b/RELEASES.md
index fa7badbe..28db6a97 100644
--- a/RELEASES.md
+++ b/RELEASES.md
@@ -1,3 +1,9 @@
+# Release 0.32.1 (2022-11-04)
+
+* Make an option for choosing compression method for parquet
+    * At `fuga` : `fn write_parquet(&self, path: &str, compression: CompressionOptions)`
+    * At `prelude` : `fn write_parquet(&self, path:&str)` (Default: `CompressionOptions::Uncompressed`)
+
 # Release 0.32.0 (2022-11-03)
 
 ## DataFrame meets Parquet
diff --git a/src/fuga/mod.rs b/src/fuga/mod.rs
index b7b6d769..f78e9a50 100644
--- a/src/fuga/mod.rs
+++ b/src/fuga/mod.rs
@@ -13,7 +13,7 @@
 //! extern crate peroxide;
 //! use peroxide::fuga::*;
 //!
-//! // Then you can use everyting in peroxide.
+//! // Then you can use everything in peroxide.
 //! ```
 //!
 //! # Compare with `prelude`
@@ -103,6 +103,48 @@
 //!     a.solve(&b).print(); // [1, 1]
 //! }
 //! ```
+//!
+//! * DataFrame with Parquet
+//!
+//! ```
+//! extern crate peroxide;
+//! use peroxide::fuga::*;
+//!
+//! fn main() {
+//!     let x = seq(0, 1, 0.1);
+//!     let y = x.fmap(|t| t.powi(2));
+//!
+//!     let mut df = DataFrame::new(vec![]);
+//!     df.push("x", Series::new(x));
+//!     df.push("y", Series::new(y));
+//!
+//!     df.print();
+//!
+//!     # #[cfg(feature="parquet")] {
+//!     df.write_parquet("example_data/test.parquet", CompressionOptions::Uncompressed).unwrap();
+//!     # }
+//! }
+//! ```
+//!
+//! ```
+//! extern crate peroxide;
+//! use peroxide::prelude::*;
+//!
+//! fn main() {
+//!     let x = seq(0, 1, 0.1);
+//!     let y = x.fmap(|t| t.powi(2));
+//!
+//!     let mut df = DataFrame::new(vec![]);
+//!     df.push("x", Series::new(x));
+//!     df.push("y", Series::new(y));
+//!
+//!     df.print();
+//!
+//!     # #[cfg(feature="parquet")] {
+//!     df.write_parquet("example_data/test.parquet").unwrap();
+//!     # }
+//! }
+//! ```
 
 #[allow(unused_imports)]
 pub use crate::macros::{julia_macro::*, matlab_macro::*, r_macro::*};
diff --git a/src/prelude/mod.rs b/src/prelude/mod.rs
index b0e3e21b..26e7e892 100644
--- a/src/prelude/mod.rs
+++ b/src/prelude/mod.rs
@@ -99,6 +99,48 @@
 //!     a.solve(&b).print(); // [1, 1]
 //! }
 //! ```
+//!
+//! * DataFrame with Parquet
+//!
+//! ```
+//! extern crate peroxide;
+//! use peroxide::fuga::*;
+//!
+//! fn main() {
+//!     let x = seq(0, 1, 0.1);
+//!     let y = x.fmap(|t| t.powi(2));
+//!
+//!     let mut df = DataFrame::new(vec![]);
+//!     df.push("x", Series::new(x));
+//!     df.push("y", Series::new(y));
+//!
+//!     df.print();
+//!
+//!     # #[cfg(feature="parquet")] {
+//!     df.write_parquet("example_data/test.parquet", CompressionOptions::Uncompressed).unwrap();
+//!     # }
+//! }
+//! ```
+//!
+//! ```
+//! extern crate peroxide;
+//! use peroxide::prelude::*;
+//!
+//! fn main() {
+//!     let x = seq(0, 1, 0.1);
+//!     let y = x.fmap(|t| t.powi(2));
+//!
+//!     let mut df = DataFrame::new(vec![]);
+//!     df.push("x", Series::new(x));
+//!     df.push("y", Series::new(y));
+//!
+//!     df.print();
+//!
+//!     # #[cfg(feature="parquet")] {
+//!     df.write_parquet("example_data/test.parquet").unwrap();
+//!     # }
+//! }
+//! ```
 
 #[allow(unused_imports)]
 pub use crate::macros::{julia_macro::*, matlab_macro::*, r_macro::*};
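Taken together, the three patches give `fuga` the two-argument `fn write_parquet(&self, path: &str, compression: CompressionOptions)`, while `prelude` keeps the one-argument form and defaults to `CompressionOptions::Uncompressed`. A minimal sketch (not part of the patch series) of picking a non-default codec through `fuga`, assuming the `parquet` feature is enabled; the output path is made up for illustration:

```rust
extern crate peroxide;
use peroxide::fuga::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let x = seq(0, 1, 0.1);
    let y = x.fmap(|t| t.powi(2));

    let mut df = DataFrame::new(vec![]);
    df.push("x", Series::new(x));
    df.push("y", Series::new(y));

    // Snappy was the codec hardcoded before this patch series;
    // after it, the codec is just an argument.
    df.write_parquet("example_data/fuga_snappy.parquet", CompressionOptions::Snappy)?;

    // Reading back takes no compression argument in either module.
    let dg = DataFrame::read_parquet("example_data/fuga_snappy.parquet")?;
    dg.print();

    Ok(())
}
```

With `prelude`, the same write is just `df.write_parquet("example_data/fuga_snappy.parquet")?`, matching the 0.32.1 release note.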