diff --git a/.travis.yml b/.travis.yml index cf6cc9b9..9c360970 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,3 @@ -#language: rust -#rust: -# - 1.9.0 -# - stable -# - beta -# - nightly -#script: -# - cargo build --verbose -# - cargo doc -# - cargo test --verbose -# - if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then -# cargo bench --verbose; -# fi - language: rust cache: cargo @@ -33,6 +19,13 @@ matrix: - os: linux rust: stable env: TARGET=x86_64-unknown-linux-musl + # Minimum Rust supported channel. + - os: linux + rust: 1.15.0 + env: TARGET=x86_64-unknown-linux-gnu + - os: linux + rust: 1.15.0 + env: TARGET=x86_64-unknown-linux-musl before_install: - export PATH="$PATH:$HOME/.cargo/bin" diff --git a/BENCHMARKS.md b/BENCHMARKS.md index 1071db95..7326a49f 100644 --- a/BENCHMARKS.md +++ b/BENCHMARKS.md @@ -6,27 +6,27 @@ These benchmarks were run with which is a random 1,000,000 row subset of the world city population dataset from the [Data Science Toolkit](https://github.com/petewarden/dstkdata). -These benchmarks were run on an Intel i3930K (6 CPUs, 12 threads) with 32GB of -memory. +These benchmarks were run on an Intel i7-6900K (8 CPUs, 16 threads) with 64GB +of memory. 
``` -count 0.28 seconds 162.54 MB/sec -flatten 5.31 seconds 8.57 MB/sec -flatten_condensed 5.39 seconds 8.44 MB/sec -frequency 2.54 seconds 17.91 MB/sec -index 0.27 seconds 168.56 MB/sec -sample_10 0.47 seconds 96.83 MB/sec -sample_1000 0.49 seconds 92.88 MB/sec -sample_100000 0.62 seconds 73.40 MB/sec -search 0.71 seconds 64.10 MB/sec -select 0.47 seconds 96.83 MB/sec -sort 3.36 seconds 13.54 MB/sec -slice_one_middle 0.22 seconds 206.88 MB/sec -slice_one_middle_index 0.01 seconds 4551.36 MB/sec -stats 1.37 seconds 33.22 MB/sec -stats_index 0.23 seconds 197.88 MB/sec -stats_everything 3.90 seconds 11.67 MB/sec -stats_everything_index 2.58 seconds 17.64 MB/sec +count 0.11 seconds 413.76 MB/sec +flatten 4.54 seconds 10.02 MB/sec +flatten_condensed 4.45 seconds 10.22 MB/sec +frequency 1.82 seconds 25.00 MB/sec +index 0.12 seconds 379.28 MB/sec +sample_10 0.18 seconds 252.85 MB/sec +sample_1000 0.18 seconds 252.85 MB/sec +sample_100000 0.29 seconds 156.94 MB/sec +search 0.27 seconds 168.56 MB/sec +select 0.14 seconds 325.09 MB/sec +sort 2.18 seconds 20.87 MB/sec +slice_one_middle 0.08 seconds 568.92 MB/sec +slice_one_middle_index 0.01 seconds 4551.36 MB/sec +stats 1.09 seconds 41.75 MB/sec +stats_index 0.15 seconds 303.42 MB/sec +stats_everything 1.94 seconds 23.46 MB/sec +stats_everything_index 0.93 seconds 48.93 MB/sec ``` ### Details @@ -39,4 +39,3 @@ The `count` command can be viewed as a sort of baseline of the fastest possible command that parses every record in CSV data. The benchmarks that end with `_index` are run with indexing enabled. 
- diff --git a/Cargo.lock b/Cargo.lock index 51f8ca5c..78154dff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,34 +1,37 @@ [root] name = "xsv" -version = "0.11.0" +version = "0.12.0" dependencies = [ - "byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)", - "csv 0.14.7 (registry+https://github.com/rust-lang/crates.io-index)", - "docopt 0.6.86 (registry+https://github.com/rust-lang/crates.io-index)", + "csv 1.0.0-beta.1 (registry+https://github.com/rust-lang/crates.io-index)", + "csv-index 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "docopt 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", "filetime 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", "log 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", - "num_cpus 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", - "quickcheck 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", + "num_cpus 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "quickcheck 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", "rand 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc-serialize 0.3.23 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", "streaming-stats 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)", - "tabwriter 0.1.25 (registry+https://github.com/rust-lang/crates.io-index)", + "tabwriter 1.0.3 
(registry+https://github.com/rust-lang/crates.io-index)", "threadpool 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "aho-corasick" -version = "0.5.3" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "byteorder" -version = "0.5.3" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -41,31 +44,40 @@ dependencies = [ [[package]] name = "csv" -version = "0.14.7" +version = "1.0.0-beta.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc-serialize 0.3.23 (registry+https://github.com/rust-lang/crates.io-index)", + "csv-core 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] -name = "docopt" -version = "0.6.86" +name = "csv-core" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "lazy_static 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc-serialize 0.3.23 (registry+https://github.com/rust-lang/crates.io-index)", - "strsim 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] -name = "env_logger" -version = "0.3.5" +name = "csv-index" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "log 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", - "regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)", + "byteorder 1.0.0 
(registry+https://github.com/rust-lang/crates.io-index)", + "csv 1.0.0-beta.1 (registry+https://github.com/rust-lang/crates.io-index)", + "csv-core 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "docopt" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)", + "strsim 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -73,7 +85,7 @@ name = "filetime" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.23 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -87,12 +99,12 @@ dependencies = [ [[package]] name = "lazy_static" -version = "0.2.4" +version = "0.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "libc" -version = "0.2.21" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -102,10 +114,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "memchr" -version = "0.1.11" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.23 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -114,8 +126,8 @@ version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "num-bigint 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", - "num-complex 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", - "num-integer 0.1.33 
(registry+https://github.com/rust-lang/crates.io-index)", + "num-complex 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", + "num-integer 0.1.34 (registry+https://github.com/rust-lang/crates.io-index)", "num-iter 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)", "num-rational 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)", "num-traits 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", @@ -126,24 +138,24 @@ name = "num-bigint" version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "num-integer 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)", + "num-integer 0.1.34 (registry+https://github.com/rust-lang/crates.io-index)", "num-traits 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", "rand 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc-serialize 0.3.23 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "num-complex" -version = "0.1.36" +version = "0.1.37" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "num-traits 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc-serialize 0.3.23 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "num-integer" -version = "0.1.33" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "num-traits 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", @@ -154,7 +166,7 @@ name = "num-iter" version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "num-integer 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)", + "num-integer 0.1.34 
(registry+https://github.com/rust-lang/crates.io-index)", "num-traits 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -164,9 +176,9 @@ version = "0.1.36" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "num-bigint 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", - "num-integer 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)", + "num-integer 0.1.34 (registry+https://github.com/rust-lang/crates.io-index)", "num-traits 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)", - "rustc-serialize 0.3.23 (registry+https://github.com/rust-lang/crates.io-index)", + "rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -176,52 +188,79 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "num_cpus" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.23 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "quickcheck" -version = "0.3.2" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", - "log 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", "rand 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "quote" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "rand" version = "0.3.15" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.23 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "regex" -version = "0.1.80" +version = "0.2.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", - "memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)", - "regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)", - "thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)", - "utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "aho-corasick 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "regex-syntax" -version = "0.3.9" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "rustc-serialize" -version = "0.3.23" +version = "0.3.24" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde" +version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "serde_derive" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "quote 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive_internals 0.15.0 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.11.11 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_derive_internals" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "syn 0.11.11 (registry+https://github.com/rust-lang/crates.io-index)", + "synom 0.11.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "streaming-stats" version = "0.1.28" @@ 
-232,12 +271,30 @@ dependencies = [ [[package]] name = "strsim" -version = "0.5.2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "syn" +version = "0.11.11" source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "quote 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)", + "synom 0.11.3 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.0.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "synom" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.0.4 (registry+https://github.com/rust-lang/crates.io-index)", +] [[package]] name = "tabwriter" -version = "0.1.25" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "unicode-width 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)", @@ -245,19 +302,20 @@ dependencies = [ [[package]] name = "thread-id" -version = "2.0.0" +version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", - "libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.23 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] name = "thread_local" -version = "0.2.7" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "thread-id 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "unreachable 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -270,9 +328,27 @@ name = "unicode-width" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "unicode-xid" +version = "0.0.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "unreachable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "utf8-ranges" -version = "0.1.3" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "void" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -286,38 +362,48 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" [metadata] -"checksum aho-corasick 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ca972c2ea5f742bfce5687b9aef75506a764f61d37f8f649047846a9686ddb66" -"checksum byteorder 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "0fc10e8cc6b2580fda3f36eb6dc5316657f812a3df879a44a66fc9f0fdbc4855" +"checksum aho-corasick 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "500909c4f87a9e52355b26626d890833e9e1d53ac566db76c36faa984b889699" +"checksum byteorder 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c40977b0ee6b9885c9013cd41d9feffdd22deb3bb4dc3a71d901cc7a77de18c8" "checksum chan 0.1.19 (registry+https://github.com/rust-lang/crates.io-index)" = "f93bfe971116428a9066c1c3c69a09ae3ef69432f8418be28ab50f96783e6a50" -"checksum csv 0.14.7 (registry+https://github.com/rust-lang/crates.io-index)" = "266c1815d7ca63a5bd86284043faf91e8c95e943e55ce05dc0ae08e952de18bc" -"checksum docopt 0.6.86 (registry+https://github.com/rust-lang/crates.io-index)" = "4a7ef30445607f6fc8720f0a0a2c7442284b629cf0d049286860fae23e71c4d9" -"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" +"checksum csv 1.0.0-beta.1 (registry+https://github.com/rust-lang/crates.io-index)" = 
"81675dd89651e2aa0989e6a5249dc5c5bcfc13772baec7f9652208e7691e0955" +"checksum csv-core 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6a11ab3094dd197341f9d66753789a5cdf29ce35450a7d6e7968024e2d44519c" +"checksum csv-index 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7405ccdb151a01a4844b2cf851e2806dca2bdec892537057bf0719f4a7504c60" +"checksum docopt 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ab32ea6e284d87987066f21a9e809a73c14720571ef34516f0890b3d355ccfd8" "checksum filetime 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "5363ab8e4139b8568a6237db5248646e5a8a2f89bd5ccb02092182b11fd3e922" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" -"checksum lazy_static 0.2.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7291b1dd97d331f752620b02dfdbc231df7fc01bf282a00769e1cdb963c460dc" -"checksum libc 0.2.21 (registry+https://github.com/rust-lang/crates.io-index)" = "88ee81885f9f04bff991e306fea7c1c60a5f0f9e409e99f6b40e3311a3363135" +"checksum lazy_static 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "3b37545ab726dd833ec6420aaba8231c5b320814b9029ad585555d2a03e94fbf" +"checksum libc 0.2.23 (registry+https://github.com/rust-lang/crates.io-index)" = "e7eb6b826bfc1fdea7935d46556250d1799b7fe2d9f7951071f4291710665e3e" "checksum log 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "5141eca02775a762cc6cd564d8d2c50f67c0ea3a372cbf1c51592b3e029e10ad" -"checksum memchr 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d8b629fb514376c675b98c1421e80b151d3817ac42d7c667717d282761418d20" +"checksum memchr 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "1dbccc0e46f1ea47b9f17e6d67c5a96bd27030519c519c9c91327e31275a47b4" "checksum num 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = 
"98b15ba84e910ea7a1973bccd3df7b31ae282bf9d8bd2897779950c9b8303d40" "checksum num-bigint 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "ba6d838b16e56da1b6c383d065ff1ec3c7d7797f65a3e8f6ba7092fd87820bac" -"checksum num-complex 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "3534898d8a1f6b16c12f9fc2f4eaabc7ecdcc55f267213caa8988fdc7d60ff94" -"checksum num-integer 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)" = "21e4df1098d1d797d27ef0c69c178c3fab64941559b290fcae198e0825c9c8b5" +"checksum num-complex 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "148eb324ca772230853418731ffdf13531738b50f89b30692a01fcdcb0a64677" +"checksum num-integer 0.1.34 (registry+https://github.com/rust-lang/crates.io-index)" = "ef1a4bf6f9174aa5783a9b4cc892cacd11aebad6c69ad027a0b65c6ca5f8aa37" "checksum num-iter 0.1.33 (registry+https://github.com/rust-lang/crates.io-index)" = "f7d1891bd7b936f12349b7d1403761c8a0b85a18b148e9da4429d5d102c1a41e" "checksum num-rational 0.1.36 (registry+https://github.com/rust-lang/crates.io-index)" = "c2dc5ea04020a8f18318ae485c751f8cfa1c0e69dcf465c29ddaaa64a313cc44" "checksum num-traits 0.1.37 (registry+https://github.com/rust-lang/crates.io-index)" = "e1cbfa3781f3fe73dc05321bed52a06d2d491eaa764c52335cf4399f046ece99" -"checksum num_cpus 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a18c392466409c50b87369414a2680c93e739aedeb498eb2bff7d7eb569744e2" -"checksum quickcheck 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0b333da40686cc05db13d933f8e7b450f403cfc5a4d005154d8d4a5ba9d14605" +"checksum num_cpus 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ca313f1862c7ec3e0dfe8ace9fa91b1d9cb5c84ace3d00f5ec4216238e93c167" +"checksum quickcheck 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "02c2411d418cea2364325b18a205664f9ef8252e06b2e911db97c0b0d98b1406" +"checksum quote 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" 
= "7a6e920b65c65f10b2ae65c831a81a073a89edd28c7cce89475bff467ab4167a" "checksum rand 0.3.15 (registry+https://github.com/rust-lang/crates.io-index)" = "022e0636ec2519ddae48154b028864bdce4eaf7d35226ab8e65c611be97b189d" -"checksum regex 0.1.80 (registry+https://github.com/rust-lang/crates.io-index)" = "4fd4ace6a8cf7860714a2c2280d6c1f7e6a413486c13298bbc86fd3da019402f" -"checksum regex-syntax 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "f9ec002c35e86791825ed294b50008eea9ddfc8def4420124fbc6b08db834957" -"checksum rustc-serialize 0.3.23 (registry+https://github.com/rust-lang/crates.io-index)" = "684ce48436d6465300c9ea783b6b14c4361d6b8dcbb1375b486a69cc19e2dfb0" +"checksum regex 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "1731164734096285ec2a5ec7fea5248ae2f5485b3feeb0115af4fda2183b2d1b" +"checksum regex-syntax 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad890a5eef7953f55427c50575c680c42841653abd2b028b68cd223d157f62db" +"checksum rustc-serialize 0.3.24 (registry+https://github.com/rust-lang/crates.io-index)" = "dcf128d1287d2ea9d80910b5f1120d0b8eede3fbf1abe91c40d39ea7d51e6fda" +"checksum serde 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "c0c3d79316a6051231925504f6ef893d45088e8823c77a8331a3dcf427ee9087" +"checksum serde_derive 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)" = "0019cd5b9f0529a1a0e145a912e9a2d60c325c58f7f260fc36c71976e9d76aee" +"checksum serde_derive_internals 0.15.0 (registry+https://github.com/rust-lang/crates.io-index)" = "021c338d22c7e30f957a6ab7e388cb6098499dda9fd4ba1661ee074ca7a180d1" "checksum streaming-stats 0.1.28 (registry+https://github.com/rust-lang/crates.io-index)" = "f13d0cd680e11a62c5e125d9799debfb39fcfff9a2ef416336ce748f65018b89" -"checksum strsim 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "67f84c44fbb2f91db7fef94554e6b2ac05909c9c0b0bc23bb98d3a1aebfe7f7c" -"checksum tabwriter 0.1.25 
(registry+https://github.com/rust-lang/crates.io-index)" = "85dbf563da2891d55ef4b00ef08c5b5f160143f67691ff1f97ad89e77824ed3b" -"checksum thread-id 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a9539db560102d1cef46b8b78ce737ff0bb64e7e18d35b2a5688f7d097d0ff03" -"checksum thread_local 0.2.7 (registry+https://github.com/rust-lang/crates.io-index)" = "8576dbbfcaef9641452d5cf0df9b0e7eeab7694956dd33bb61515fb8f18cfdd5" +"checksum strsim 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b4d15c810519a91cf877e7e36e63fe068815c678181439f2f29e2562147c3694" +"checksum syn 0.11.11 (registry+https://github.com/rust-lang/crates.io-index)" = "d3b891b9015c88c576343b9b3e41c2c11a51c219ef067b264bd9c8aa9b441dad" +"checksum synom 0.11.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a393066ed9010ebaed60b9eafa373d4b1baac186dd7e008555b0f702b51945b6" +"checksum tabwriter 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3b7810162bc0a2eb2dc9a9bfd16ddb2d1f6022df3236d1478937bfadcb12385e" +"checksum thread-id 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8df7875b676fddfadffd96deea3b1124e5ede707d4884248931077518cf1f773" +"checksum thread_local 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "c85048c6260d17cf486ceae3282d9fb6b90be220bf5b28c400f5485ffc29f0c7" "checksum threadpool 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "59f6d3eff89920113dac9db44dde461d71d01e88a5b57b258a0466c32b5d7fe1" "checksum unicode-width 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "bf3a113775714a22dcb774d8ea3655c53a32debae63a063acc00a91cc586245f" -"checksum utf8-ranges 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "a1ca13c08c41c9c3e04224ed9ff80461d97e121589ff27c753a16cb10830ae0f" +"checksum unicode-xid 0.0.4 (registry+https://github.com/rust-lang/crates.io-index)" = "8c1f860d7d29cf02cb2f3f359fd35991af3d30bac52c57d265a3c461074cb4dc" +"checksum unreachable 0.1.1 
(registry+https://github.com/rust-lang/crates.io-index)" = "1f2ae5ddb18e1c92664717616dd9549dde73f539f01bd7b77c2edb2446bdff91" +"checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122" +"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" "checksum winapi 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)" = "167dc9d6949a9b857f3451275e911c3f44255842c1f7a76f33c55103a909087a" "checksum winapi-build 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2d315eee3b34aca4797b2da6b13ed88266e6d612562a0c46390af8299fc699bc" diff --git a/Cargo.toml b/Cargo.toml index 6d36b620..b9e386d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "xsv" -version = "0.11.0" #:version +version = "0.12.0" #:version authors = ["Andrew Gallant "] description = "A high performance CSV command line toolkit." documentation = "http://burntsushi.net/rustdoc/xsv/" @@ -26,19 +26,22 @@ opt-level = 3 opt-level = 3 [dependencies] -byteorder = "0.5" +byteorder = "1" chan = "0.1" -csv = "0.14" -docopt = "0.6" +csv = "1.0.0-beta.1" +csv-index = "0.1" +docopt = "0.7" filetime = "0.1" -num_cpus = "1.0" -rand = "0.3" -regex = "0.1" +num_cpus = "1.4" +rand = "0.3.15" +regex = "0.2" rustc-serialize = "0.3" streaming-stats = "0.1" -tabwriter = "0.1" +tabwriter = "1" threadpool = "1.3" [dev-dependencies] -quickcheck = "0.3" +quickcheck = { version = "0.4", default-features = false } log = "0.3" +serde = "1" +serde_derive = "1" diff --git a/README.md b/README.md index a0eb72dc..e444c2ae 100644 --- a/README.md +++ b/README.md @@ -300,7 +300,7 @@ right and full outer join support too. Binaries for Windows, Linux and Mac are available [from Github](https://github.com/BurntSushi/xsv/releases/latest). 
If you're a **Mac OS X Homebrew** user, then you can install xsv -from homebrew-core, (compiled with rust stable, no SIMD): +from homebrew-core: ``` $ brew install xsv @@ -309,7 +309,13 @@ $ brew install xsv Alternatively, you can compile from source by [installing Cargo](https://crates.io/install) ([Rust's](http://www.rust-lang.org/) package manager) -and building `xsv`: +and installing `xsv` using Cargo: + +```bash +cargo install xsv +``` + +Compiling from this repository also works similarly: ```bash git clone git://github.com/BurntSushi/xsv @@ -317,7 +323,7 @@ cd xsv cargo build --release ``` -Compilation will probably take 1-2 minutes depending on your machine. The +Compilation will probably take a few minutes depending on your machine. The binary will end up in `./target/release/xsv`. diff --git a/src/cmd/cat.rs b/src/cmd/cat.rs index d9f3bd86..7e7c1f3f 100644 --- a/src/cmd/cat.rs +++ b/src/cmd/cat.rs @@ -1,5 +1,3 @@ -use std::iter::repeat; - use csv; use CliResult; @@ -52,8 +50,7 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); - + let args: Args = util::get_args(USAGE, argv)?; if args.cmd_rows { args.cat_rows() } else if args.cmd_columns { @@ -72,57 +69,53 @@ impl Args { } fn cat_rows(&self) -> CliResult<()> { - let mut wtr = try!(Config::new(&self.flag_output).writer()); - for (i, conf) in try!(self.configs()).into_iter().enumerate() { - let mut rdr = try!(conf.reader()); + let mut wtr = Config::new(&self.flag_output).writer()?; + for (i, conf) in self.configs()?.into_iter().enumerate() { + let mut rdr = conf.reader()?; if i == 0 { - try!(conf.write_headers(&mut rdr, &mut wtr)); + conf.write_headers(&mut rdr, &mut wtr)?; } for r in rdr.byte_records() { - try!(wtr.write(try!(r).into_iter())); + wtr.write_record(&r?)?; } } wtr.flush().map_err(From::from) } fn cat_columns(&self) -> CliResult<()> { - let mut wtr = try!(Config::new(&self.flag_output).writer()); - let mut rdrs = 
try!(try!(self.configs()) - .into_iter() - .map(|conf| conf.no_headers(true).reader()) - .collect::<Result<Vec<_>, _>>()); + let mut wtr = Config::new(&self.flag_output).writer()?; + let mut rdrs = self.configs()? + .into_iter() + .map(|conf| conf.no_headers(true).reader()) + .collect::<Result<Vec<_>, _>>()?; // Find the lengths of each record. If a length varies, then an error // will occur so we can rely on the first length being the correct one. - let mut lengths = vec!(); + let mut lengths = vec![]; for rdr in &mut rdrs { - lengths.push(try!(rdr.byte_headers()).len()); + lengths.push(rdr.byte_headers()?.len()); } let mut iters = rdrs.iter_mut() .map(|rdr| rdr.byte_records()) .collect::<Vec<_>>(); 'OUTER: loop { - let mut records: Vec<Vec<csv::ByteString>> = vec!(); + let mut record = csv::ByteRecord::new(); let mut num_done = 0; for (iter, &len) in iters.iter_mut().zip(lengths.iter()) { match iter.next() { None => { num_done += 1; if self.flag_pad { - // This can probably be optimized by - // pre-allocating. It would avoid the intermediate - // `Vec`. - records.push( - repeat(util::empty_field()) - .take(len) - .collect()); + for _ in 0..len { + record.push_field(b""); + } } else { break 'OUTER; } } Some(Err(err)) => return fail!(err), - Some(Ok(next)) => records.push(next), + Some(Ok(next)) => record.extend(&next), } } // Only needed when `--pad` is set. 
@@ -131,7 +124,7 @@ impl Args { if num_done >= iters.len() { break 'OUTER; } - try!(wtr.write(records.concat().into_iter())); + wtr.write_record(&record)?; } wtr.flush().map_err(From::from) } diff --git a/src/cmd/count.rs b/src/cmd/count.rs index 07138e4b..9419a956 100644 --- a/src/cmd/count.rs +++ b/src/cmd/count.rs @@ -1,4 +1,4 @@ -use csv::NextField; +use csv; use CliResult; use config::{Delimiter, Config}; @@ -29,30 +29,22 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; let conf = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(args.flag_no_headers); + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers); let count = - match try!(conf.indexed()) { + match conf.indexed()? { Some(idx) => idx.count(), None => { - let mut rdr = try!(conf.reader()); + let mut rdr = conf.reader()?; let mut count = 0u64; - loop { - match rdr.next_bytes() { - NextField::EndOfCsv => break, - NextField::EndOfRecord => { count += 1; } - NextField::Error(err) => return fail!(err), - NextField::Data(_) => {} - } - } - if !conf.no_headers && count > 0 { - count - 1 - } else { - count + let mut record = csv::ByteRecord::new(); + while rdr.read_byte_record(&mut record)? 
{ + count += 1; } + count } }; Ok(println!("{}", count)) diff --git a/src/cmd/fixlengths.rs b/src/cmd/fixlengths.rs index 9389a25e..eb176bb7 100644 --- a/src/cmd/fixlengths.rs +++ b/src/cmd/fixlengths.rs @@ -1,5 +1,7 @@ use std::cmp; +use csv; + use CliResult; use config::{Config, Delimiter}; use util; @@ -40,11 +42,11 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; let config = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(true) - .flexible(true); + .delimiter(args.flag_delimiter) + .no_headers(true) + .flexible(true); let length = match args.flag_length { Some(length) => { if length == 0 { @@ -58,19 +60,15 @@ pub fn run(argv: &[&str]) -> CliResult<()> { Please specify a file path."); } let mut maxlen = 0usize; - let mut rdr = try!(config.reader()); - while !rdr.done() { - let mut index = 0usize; - let mut nonempty_count = 0usize; - loop { - match rdr.next_bytes().into_iter_result() { - None => break, - Some(r) => { - index += 1; - if index == 1 || try!(r).len() > 0 { - nonempty_count = index; - } - } + let mut rdr = config.reader()?; + let mut record = csv::ByteRecord::new(); + while rdr.read_byte_record(&mut record)? 
{ + let mut index = 0; + let mut nonempty_count = 0; + for field in &record { + index += 1; + if index == 1 || !field.is_empty() { + nonempty_count = index; } } maxlen = cmp::max(maxlen, nonempty_count); @@ -79,19 +77,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> { } }; - let mut rdr = try!(config.reader()); - let mut wtr = try!(Config::new(&args.flag_output).writer()); + let mut rdr = config.reader()?; + let mut wtr = Config::new(&args.flag_output).writer()?; for r in rdr.byte_records() { - let mut r = try!(r); + let mut r = r?; if length >= r.len() { for _ in r.len()..length { - r.push(util::empty_field()); + r.push_field(b""); } } else { r.truncate(length); } - try!(wtr.write(r.into_iter())); + wtr.write_record(&r)?; } - try!(wtr.flush()); + wtr.flush()?; Ok(()) } diff --git a/src/cmd/flatten.rs b/src/cmd/flatten.rs index 918064c5..12414e76 100644 --- a/src/cmd/flatten.rs +++ b/src/cmd/flatten.rs @@ -48,33 +48,33 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; let rconfig = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(args.flag_no_headers); - let mut rdr = try!(rconfig.reader()); - let headers = try!(rdr.byte_headers()); + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers); + let mut rdr = rconfig.reader()?; + let headers = rdr.byte_headers()?.clone(); let mut wtr = TabWriter::new(io::stdout()); let mut first = true; for r in rdr.byte_records() { if !first && !args.flag_separator.is_empty() { - try!(writeln!(&mut wtr, "{}", args.flag_separator)); + writeln!(&mut wtr, "{}", args.flag_separator)?; } first = false; - let r = try!(r).into_iter(); - for (i, (header, field)) in headers.iter().zip(r).enumerate() { + let r = r?; + for (i, (header, field)) in headers.iter().zip(&r).enumerate() { if rconfig.no_headers { - try!(write!(&mut wtr, "{}", i)); + write!(&mut wtr, "{}", i)?; } else { - 
try!(wtr.write_all(&header)); + wtr.write_all(&header)?; } - try!(wtr.write_all(b"\t")); - try!(wtr.write_all(&*util::condense(Cow::Borrowed(&*field), - args.flag_condense))); - try!(wtr.write_all(b"\n")); + wtr.write_all(b"\t")?; + wtr.write_all(&*util::condense( + Cow::Borrowed(&*field), args.flag_condense))?; + wtr.write_all(b"\n")?; } } - try!(wtr.flush()); + wtr.flush()?; Ok(()) } diff --git a/src/cmd/fmt.rs b/src/cmd/fmt.rs index 8ca89fca..4bd70701 100644 --- a/src/cmd/fmt.rs +++ b/src/cmd/fmt.rs @@ -47,31 +47,34 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; let rconfig = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(true); - let wconfig = Config::new(&args.flag_output) - .delimiter(args.flag_out_delimiter) - .crlf(args.flag_crlf); - let mut rdr = try!(rconfig.reader()); - let mut wtr = try!(wconfig.writer()); + .delimiter(args.flag_delimiter) + .no_headers(true); + let mut wconfig = Config::new(&args.flag_output) + .delimiter(args.flag_out_delimiter) + .crlf(args.flag_crlf); if args.flag_ascii { - wtr = wtr.delimiter(b'\x1f') - .record_terminator(csv::RecordTerminator::Any(b'\x1e')); + wconfig = wconfig + .delimiter(Some(Delimiter(b'\x1f'))) + .terminator(csv::Terminator::Any(b'\x1e')); } if args.flag_quote_always { - wtr = wtr.quote_style(csv::QuoteStyle::Always); + wconfig = wconfig.quote_style(csv::QuoteStyle::Always); } if let Some(escape) = args.flag_escape { - wtr = wtr.escape(escape.as_byte()).double_quote(false); + wconfig = wconfig.escape(escape.as_byte()).double_quote(false); } - wtr = wtr.quote(args.flag_quote.as_byte()); + wconfig = wconfig.quote(args.flag_quote.as_byte()); + + + let mut rdr = rconfig.reader()?; + let mut wtr = wconfig.writer()?; for r in rdr.byte_records() { - try!(wtr.write(try!(r).into_iter())); + wtr.write_record(&r?)?; } - try!(wtr.flush()); + wtr.flush()?; Ok(()) } diff 
--git a/src/cmd/frequency.rs b/src/cmd/frequency.rs index 38a3902c..ac4672f3 100644 --- a/src/cmd/frequency.rs +++ b/src/cmd/frequency.rs @@ -2,13 +2,13 @@ use std::fs; use std::io; use chan; -use csv::{self, ByteString}; -use csv::index::Indexed; +use csv; use stats::{Frequencies, merge_all}; use threadpool::ThreadPool; use CliResult; use config::{Config, Delimiter}; +use index::Indexed; use select::{SelectColumns, Selection}; use util; @@ -74,41 +74,42 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; let rconfig = args.rconfig(); - let mut wtr = try!(Config::new(&args.flag_output).writer()); - let (headers, tables) = try!(match try!(args.rconfig().indexed()) { + let mut wtr = Config::new(&args.flag_output).writer()?; + let (headers, tables) = match args.rconfig().indexed()? { Some(ref mut idx) if args.njobs() > 1 => args.parallel_ftables(idx), _ => args.sequential_ftables(), - }); + }?; - try!(wtr.write(vec!["field", "value", "count"].into_iter())); + wtr.write_record(vec!["field", "value", "count"])?; let head_ftables = headers.into_iter().zip(tables.into_iter()); - for (i, (mut header, ftab)) in head_ftables.enumerate() { + for (i, (header, ftab)) in head_ftables.enumerate() { + let mut header = header.to_vec(); if rconfig.no_headers { header = (i+1).to_string().into_bytes(); } for (value, count) in args.counts(&ftab).into_iter() { let count = count.to_string(); let row = vec![&*header, &*value, count.as_bytes()]; - try!(wtr.write(row.into_iter())); + wtr.write_record(row)?; } } Ok(()) } -type ByteRow = Vec; -type Headers = ByteRow; -type FTable = Frequencies; -type FTables = Vec>; +type ByteString = Vec; +type Headers = csv::ByteRecord; +type FTable = Frequencies>; +type FTables = Vec>>; impl Args { fn rconfig(&self) -> Config { Config::new(&self.arg_input) - .delimiter(self.flag_delimiter) - .no_headers(self.flag_no_headers) - 
.select(self.flag_select.clone()) + .delimiter(self.flag_delimiter) + .no_headers(self.flag_no_headers) + .select(self.flag_select.clone()) } fn counts(&self, ftab: &FTable) -> Vec<(ByteString, u64)> { @@ -130,15 +131,15 @@ impl Args { } fn sequential_ftables(&self) -> CliResult<(Headers, FTables)> { - let mut rdr = try!(self.rconfig().reader()); - let (headers, sel) = try!(self.sel_headers(&mut rdr)); - Ok((headers, try!(self.ftables(&sel, rdr.byte_records())))) + let mut rdr = self.rconfig().reader()?; + let (headers, sel) = self.sel_headers(&mut rdr)?; + Ok((headers, self.ftables(&sel, rdr.byte_records())?)) } fn parallel_ftables(&self, idx: &mut Indexed) -> CliResult<(Headers, FTables)> { - let mut rdr = try!(self.rconfig().reader()); - let (headers, sel) = try!(self.sel_headers(&mut rdr)); + let mut rdr = self.rconfig().reader()?; + let (headers, sel) = self.sel_headers(&mut rdr)?; if idx.count() == 0 { return Ok((headers, vec![])); @@ -163,15 +164,15 @@ impl Args { } fn ftables(&self, sel: &Selection, it: I) -> CliResult - where I: Iterator> { + where I: Iterator> { let null = &b""[..].to_vec(); let nsel = sel.normal(); let mut tabs: Vec<_> = (0..nsel.len()).map(|_| Frequencies::new()).collect(); for row in it { - let row = try!(row); - for (i, mut field) in nsel.select(row.into_iter()).enumerate() { - field = trim(field); + let row = row?; + for (i, field) in nsel.select(row.into_iter()).enumerate() { + let field = trim(field.to_vec()); if !field.is_empty() { tabs[i].add(field); } else { @@ -185,10 +186,10 @@ impl Args { } fn sel_headers(&self, rdr: &mut csv::Reader) - -> CliResult<(ByteRow, Selection)> { - let headers = try!(rdr.byte_headers()); - let sel = try!(self.rconfig().selection(&*headers)); - Ok((sel.select(&*headers).map(|h| h.to_vec()).collect(), sel)) + -> CliResult<(csv::ByteRecord, Selection)> { + let headers = rdr.byte_headers()?; + let sel = self.rconfig().selection(headers)?; + Ok((sel.select(headers).map(|h| h.to_vec()).collect(), sel)) } 
fn njobs(&self) -> usize { diff --git a/src/cmd/headers.rs b/src/cmd/headers.rs index 3782c552..5153c111 100644 --- a/src/cmd/headers.rs +++ b/src/cmd/headers.rs @@ -40,17 +40,19 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); - let configs = try!(util::many_configs(&*args.arg_input, - args.flag_delimiter, true)); + let args: Args = util::get_args(USAGE, argv)?; + let configs = util::many_configs( + &*args.arg_input, args.flag_delimiter, true)?; let num_inputs = configs.len(); - let mut headers = vec!(); + let mut headers: Vec> = vec![]; for conf in configs.into_iter() { - let mut rdr = try!(conf.reader()); - for header in try!(rdr.byte_headers()).into_iter() { - if !args.flag_intersect || !headers.contains(&header) { - headers.push(header); + let mut rdr = conf.reader()?; + for header in rdr.byte_headers()?.iter() { + if !args.flag_intersect + || !headers.iter().any(|h| &**h == header) + { + headers.push(header.to_vec()); } } } @@ -63,11 +65,11 @@ pub fn run(argv: &[&str]) -> CliResult<()> { }; for (i, header) in headers.into_iter().enumerate() { if num_inputs == 1 && !args.flag_just_names { - try!(write!(&mut wtr, "{}\t", i+1)); + write!(&mut wtr, "{}\t", i+1)?; } - try!(wtr.write_all(&header)); - try!(wtr.write_all(b"\n")); + wtr.write_all(&header)?; + wtr.write_all(b"\n")?; } - try!(wtr.flush()); + wtr.flush()?; Ok(()) } diff --git a/src/cmd/index.rs b/src/cmd/index.rs index f24c1f85..43a72748 100644 --- a/src/cmd/index.rs +++ b/src/cmd/index.rs @@ -2,7 +2,7 @@ use std::fs; use std::io; use std::path::{Path, PathBuf}; -use csv; +use csv_index::RandomAccessSimple; use CliResult; use config::{Config, Delimiter}; @@ -42,7 +42,7 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; let pidx = match args.flag_output { None => util::idx_path(&Path::new(&args.arg_input)), @@ -51,8 +51,8 
@@ pub fn run(argv: &[&str]) -> CliResult<()> { let rconfig = Config::new(&Some(args.arg_input)) .delimiter(args.flag_delimiter); - let rdr = try!(rconfig.reader_file()); - let idx = io::BufWriter::new(try!(fs::File::create(&pidx))); - try!(csv::index::create_index(rdr, idx)); + let mut rdr = rconfig.reader_file()?; + let mut wtr = io::BufWriter::new(fs::File::create(&pidx)?); + RandomAccessSimple::create(&mut rdr, &mut wtr)?; Ok(()) } diff --git a/src/cmd/input.rs b/src/cmd/input.rs index 28f6bba2..f16a7120 100644 --- a/src/cmd/input.rs +++ b/src/cmd/input.rs @@ -35,22 +35,22 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); - - let rconfig = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(true); + let args: Args = util::get_args(USAGE, argv)?; + let mut rconfig = Config::new(&args.arg_input) + .delimiter(args.flag_delimiter) + .no_headers(true) + .quote(args.flag_quote.as_byte()); let wconfig = Config::new(&args.flag_output); - let mut rdr = try!(rconfig.reader()); - let mut wtr = try!(wconfig.writer()); - rdr = rdr.quote(args.flag_quote.as_byte()); if let Some(escape) = args.flag_escape { - rdr = rdr.escape(Some(escape.as_byte())).double_quote(false); + rconfig = rconfig.escape(escape.as_byte()).double_quote(false); } + + let mut rdr = rconfig.reader()?; + let mut wtr = wconfig.writer()?; for r in rdr.byte_records() { - try!(wtr.write(try!(r).into_iter())); + wtr.write_record(&r?)?; } - try!(wtr.flush()); + wtr.flush()?; Ok(()) } diff --git a/src/cmd/join.rs b/src/cmd/join.rs index 79be11e8..5f501c0d 100644 --- a/src/cmd/join.rs +++ b/src/cmd/join.rs @@ -6,11 +6,11 @@ use std::iter::repeat; use std::str; use byteorder::{WriteBytesExt, BigEndian}; -use csv::{self, ByteString}; -use csv::index::Indexed; +use csv; use CliResult; use config::{Config, Delimiter}; +use index::Indexed; use select::{SelectColumns, Selection}; use util; @@ -71,6 +71,8 @@ Common options: Must 
be a single character. (default: ,) "; +type ByteString = Vec; + #[derive(RustcDecodable)] struct Args { arg_columns1: SelectColumns, @@ -89,8 +91,8 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); - let mut state = try!(args.new_io_state()); + let args: Args = util::get_args(USAGE, argv)?; + let mut state = args.new_io_state()?; match ( args.flag_left, args.flag_right, @@ -98,23 +100,23 @@ pub fn run(argv: &[&str]) -> CliResult<()> { args.flag_cross, ) { (true, false, false, false) => { - try!(state.write_headers()); + state.write_headers()?; state.outer_join(false) } (false, true, false, false) => { - try!(state.write_headers()); + state.write_headers()?; state.outer_join(true) } (false, false, true, false) => { - try!(state.write_headers()); + state.write_headers()?; state.full_outer_join() } (false, false, false, true) => { - try!(state.write_headers()); + state.write_headers()?; state.cross_join() } (false, false, false, false) => { - try!(state.write_headers()); + state.write_headers()?; state.inner_join() } _ => fail!("Please pick exactly one join operation.") @@ -135,29 +137,29 @@ struct IoState { impl IoState { fn write_headers(&mut self) -> CliResult<()> { if !self.no_headers { - let mut headers = try!(self.rdr1.byte_headers()); - headers.extend(try!(self.rdr2.byte_headers()).into_iter()); - try!(self.wtr.write(headers.into_iter())); + let mut headers = self.rdr1.byte_headers()?.clone(); + headers.extend(self.rdr2.byte_headers()?.iter()); + self.wtr.write_record(&headers)?; } Ok(()) } fn inner_join(mut self) -> CliResult<()> { - let mut validx = try!(ValueIndex::new(self.rdr2, &self.sel2, - self.casei, self.nulls)); + let mut scratch = csv::ByteRecord::new(); + let mut validx = ValueIndex::new( + self.rdr2, &self.sel2, self.casei, self.nulls)?; for row in self.rdr1.byte_records() { - let row = try!(row); + let row = row?; let key = get_row_key(&self.sel1, &row, self.casei); match 
validx.values.get(&key) { None => continue, Some(rows) => { for &rowi in rows.iter() { - try!(validx.idx.seek(rowi as u64)); + validx.idx.seek(rowi as u64)?; - let mut row1 = row.iter().map(|f| Ok(&**f)); - let row2 = unsafe { validx.idx.byte_fields() }; - let combined = row1.by_ref().chain(row2); - try!(self.wtr.write_iter(combined)); + validx.idx.read_byte_record(&mut scratch)?; + let combined = row.iter().chain(scratch.iter()); + self.wtr.write_record(combined)?; } } } @@ -171,33 +173,30 @@ impl IoState { ::std::mem::swap(&mut self.sel1, &mut self.sel2); } - let (_, pad2) = try!(self.get_padding()); - let mut validx = try!(ValueIndex::new(self.rdr2, &self.sel2, - self.casei, self.nulls)); + let mut scratch = csv::ByteRecord::new(); + let (_, pad2) = self.get_padding()?; + let mut validx = ValueIndex::new( + self.rdr2, &self.sel2, self.casei, self.nulls)?; for row in self.rdr1.byte_records() { - let row = try!(row); - let key = get_row_key(&self.sel1, &*row, self.casei); + let row = row?; + let key = get_row_key(&self.sel1, &row, self.casei); match validx.values.get(&key) { None => { - let row1 = row.iter().map(|f| Ok(&**f)); - let row2 = pad2.iter().map(|f| Ok(&**f)); if right { - try!(self.wtr.write_iter(row2.chain(row1))); + self.wtr.write_record(pad2.iter().chain(&row))?; } else { - try!(self.wtr.write_iter(row1.chain(row2))); + self.wtr.write_record(row.iter().chain(&pad2))?; } } Some(rows) => { for &rowi in rows.iter() { - try!(validx.idx.seek(rowi as u64)); - let row1 = row.iter().map(|f| Ok(&**f)); - let row2 = unsafe { - validx.idx.byte_fields() - }; + validx.idx.seek(rowi as u64)?; + let row1 = row.iter(); + validx.idx.read_byte_record(&mut scratch)?; if right { - try!(self.wtr.write_iter(row2.chain(row1))); + self.wtr.write_record(scratch.iter().chain(row1))?; } else { - try!(self.wtr.write_iter(row1.chain(row2))); + self.wtr.write_record(row1.chain(&scratch))?; } } } @@ -207,32 +206,28 @@ impl IoState { } fn full_outer_join(mut self) -> CliResult<()> 
{ - let (pad1, pad2) = try!(self.get_padding()); - let mut validx = try!(ValueIndex::new(self.rdr2, &self.sel2, - self.casei, self.nulls)); + let mut scratch = csv::ByteRecord::new(); + let (pad1, pad2) = self.get_padding()?; + let mut validx = ValueIndex::new( + self.rdr2, &self.sel2, self.casei, self.nulls)?; // Keep track of which rows we've written from rdr2. let mut rdr2_written: Vec<_> = repeat(false).take(validx.num_rows).collect(); for row1 in self.rdr1.byte_records() { - let row1 = try!(row1); - let key = get_row_key(&self.sel1, &*row1, self.casei); + let row1 = row1?; + let key = get_row_key(&self.sel1, &row1, self.casei); match validx.values.get(&key) { None => { - let row1 = row1.iter().map(|f| Ok(&**f)); - let row2 = pad2.iter().map(|f| Ok(&**f)); - try!(self.wtr.write_iter(row1.chain(row2))); + self.wtr.write_record(row1.iter().chain(&pad2))?; } Some(rows) => { for &rowi in rows.iter() { rdr2_written[rowi] = true; - try!(validx.idx.seek(rowi as u64)); - let row1 = row1.iter().map(|f| Ok(&**f)); - let row2 = unsafe { - validx.idx.byte_fields() - }; - try!(self.wtr.write_iter(row1.chain(row2))); + validx.idx.seek(rowi as u64)?; + validx.idx.read_byte_record(&mut scratch)?; + self.wtr.write_record(row1.iter().chain(&scratch))?; } } } @@ -242,46 +237,41 @@ impl IoState { // from rdr1. 
for (i, &written) in rdr2_written.iter().enumerate() { if !written { - try!(validx.idx.seek(i as u64)); - let row1 = pad1.iter().map(|f| Ok(&**f)); - let row2 = unsafe { - validx.idx.byte_fields() - }; - try!(self.wtr.write_iter(row1.chain(row2))); + validx.idx.seek(i as u64)?; + validx.idx.read_byte_record(&mut scratch)?; + self.wtr.write_record(pad1.iter().chain(&scratch))?; } } Ok(()) } fn cross_join(mut self) -> CliResult<()> { + let mut pos = csv::Position::new(); + pos.set_byte(0); + let mut row2 = csv::ByteRecord::new(); for row1 in self.rdr1.byte_records() { - let row1 = try!(row1); - - try!(self.rdr2.seek(0)); - let mut first = true; - while !self.rdr2.done() { - // Skip the header row. The raw byte interface won't - // do it for us. - if first && !self.no_headers { - while let Some(f) = - self.rdr2.next_bytes().into_iter_result() { try!(f); } - first = false; - } - let row1 = row1.iter().map(|f| Ok(&**f)); - let row2 = unsafe { self.rdr2.byte_fields() }; - try!(self.wtr.write_iter(row1.chain(row2))); + let row1 = row1?; + self.rdr2.seek(pos.clone())?; + if self.rdr2.has_headers() { + // Read and skip the header row, since CSV readers disable + // the header skipping logic after being seeked. + self.rdr2.read_byte_record(&mut row2)?; + } + while self.rdr2.read_byte_record(&mut row2)? 
{ + self.wtr.write_record(row1.iter().chain(&row2))?; } } Ok(()) } - fn get_padding(&mut self) - -> CliResult<(Vec, Vec)> { - let len1 = try!(self.rdr1.byte_headers()).len(); - let len2 = try!(self.rdr2.byte_headers()).len(); + fn get_padding( + &mut self, + ) -> CliResult<(csv::ByteRecord, csv::ByteRecord)> { + let len1 = self.rdr1.byte_headers()?.len(); + let len2 = self.rdr2.byte_headers()?.len(); Ok(( - repeat(util::empty_field()).take(len1).collect(), - repeat(util::empty_field()).take(len2).collect(), + repeat(b"").take(len1).collect(), + repeat(b"").take(len2).collect(), )) } } @@ -290,20 +280,20 @@ impl Args { fn new_io_state(&self) -> CliResult>> { let rconf1 = Config::new(&Some(self.arg_input1.clone())) - .delimiter(self.flag_delimiter) - .no_headers(self.flag_no_headers) - .select(self.arg_columns1.clone()); + .delimiter(self.flag_delimiter) + .no_headers(self.flag_no_headers) + .select(self.arg_columns1.clone()); let rconf2 = Config::new(&Some(self.arg_input2.clone())) - .delimiter(self.flag_delimiter) - .no_headers(self.flag_no_headers) - .select(self.arg_columns2.clone()); - - let mut rdr1 = try!(rconf1.reader_file()); - let mut rdr2 = try!(rconf2.reader_file()); - let (sel1, sel2) = try!(self.get_selections(&rconf1, &mut rdr1, - &rconf2, &mut rdr2)); + .delimiter(self.flag_delimiter) + .no_headers(self.flag_no_headers) + .select(self.arg_columns2.clone()); + + let mut rdr1 = rconf1.reader_file()?; + let mut rdr2 = rconf2.reader_file()?; + let (sel1, sel2) = self.get_selections( + &rconf1, &mut rdr1, &rconf2, &mut rdr2)?; Ok(IoState { - wtr: try!(Config::new(&self.flag_output).writer()), + wtr: Config::new(&self.flag_output).writer()?, rdr1: rdr1, sel1: sel1, rdr2: rdr2, @@ -314,15 +304,15 @@ impl Args { }) } - fn get_selections - (&self, - rconf1: &Config, rdr1: &mut csv::Reader, - rconf2: &Config, rdr2: &mut csv::Reader) - -> CliResult<(Selection, Selection)> { - let headers1 = try!(rdr1.byte_headers()); - let headers2 = try!(rdr2.byte_headers()); - 
let select1 = try!(rconf1.selection(&*headers1)); - let select2 = try!(rconf2.selection(&*headers2)); + fn get_selections( + &self, + rconf1: &Config, rdr1: &mut csv::Reader, + rconf2: &Config, rdr2: &mut csv::Reader, + ) -> CliResult<(Selection, Selection)> { + let headers1 = rdr1.byte_headers()?; + let headers2 = rdr2.byte_headers()?; + let select1 = rconf1.selection(&*headers1)?; + let select2 = rconf2.selection(&*headers2)?; if select1.len() != select2.len() { return fail!(format!( "Column selections must have the same number of columns, \ @@ -341,40 +331,44 @@ struct ValueIndex { } impl ValueIndex { - fn new(mut rdr: csv::Reader, sel: &Selection, - casei: bool, nulls: bool) - -> CliResult> { + fn new( + mut rdr: csv::Reader, + sel: &Selection, + casei: bool, + nulls: bool, + ) -> CliResult> { let mut val_idx = HashMap::with_capacity(10000); let mut row_idx = io::Cursor::new(Vec::with_capacity(8 * 10000)); let (mut rowi, mut count) = (0usize, 0usize); - let row_len = try!(rdr.byte_headers()).len(); // This logic is kind of tricky. Basically, we want to include // the header row in the line index (because that's what csv::index // does), but we don't want to include header values in the ValueIndex. - if !rdr.has_headers { + if !rdr.has_headers() { // ... so if there are no headers, we seek to the beginning and // index everything. - try!(rdr.seek(0)); + let mut pos = csv::Position::new(); + pos.set_byte(0); + rdr.seek(pos)?; } else { // ... and if there are headers, we make sure that we've parsed // them, and write the offset of the header row to the index. - try!(rdr.byte_headers()); - try!(row_idx.write_u64::(0)); + rdr.byte_headers()?; + row_idx.write_u64::(0)?; count += 1; } - while !rdr.done() { - // This is a bit hokey. We're doing this manually instead of - // calling `csv::index::create` so we can create both indexes - // in one pass. 
- try!(row_idx.write_u64::(rdr.byte_offset())); - - let mut row = Vec::with_capacity(row_len); - while let Some(r) = rdr.next_bytes().into_iter_result() { - row.push(try!(r).to_vec()); - } - let fields: Vec<_> = sel.select(&row).map(|v| transform(v, casei)).collect(); + let mut row = csv::ByteRecord::new(); + while rdr.read_byte_record(&mut row)? { + // This is a bit hokey. We're doing this manually instead of using + // the `csv-index` crate directly so that we can create both + // indexes in one pass. + row_idx.write_u64::(row.position().unwrap().byte())?; + + let fields: Vec<_> = sel + .select(&row) + .map(|v| transform(v, casei)) + .collect(); if nulls || !fields.iter().any(|f| f.is_empty()) { match val_idx.entry(fields) { Entry::Vacant(v) => { @@ -382,17 +376,20 @@ impl ValueIndex { rows.push(rowi); v.insert(rows); } - Entry::Occupied(mut v) => { v.get_mut().push(rowi); } + Entry::Occupied(mut v) => { + v.get_mut().push(rowi); + } } } rowi += 1; count += 1; } - try!(row_idx.write_u64::(count as u64)); + + row_idx.write_u64::(count as u64)?; + let idx = Indexed::open(rdr, io::Cursor::new(row_idx.into_inner()))?; Ok(ValueIndex { values: val_idx, - idx: try!(Indexed::open(rdr, - io::Cursor::new(row_idx.into_inner()))), + idx: idx, num_rows: rowi, }) } @@ -408,14 +405,17 @@ impl fmt::Debug for ValueIndex { let keys = keys.iter() .map(|k| String::from_utf8(k.to_vec()).unwrap()) .collect::>(); - try!(writeln!(f, "({}) => {:?}", keys.connect(", "), rows)) + writeln!(f, "({}) => {:?}", keys.join(", "), rows)? 
} Ok(()) } } -fn get_row_key(sel: &Selection, row: &[ByteString], casei: bool) - -> Vec { +fn get_row_key( + sel: &Selection, + row: &csv::ByteRecord, + casei: bool, +) -> Vec { sel.select(row).map(|v| transform(&v, casei)).collect() } diff --git a/src/cmd/partition.rs b/src/cmd/partition.rs index 5fe2c969..687302ca 100644 --- a/src/cmd/partition.rs +++ b/src/cmd/partition.rs @@ -53,8 +53,8 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); - try!(fs::create_dir_all(&args.arg_outdir)); + let args: Args = util::get_args(USAGE, argv)?; + fs::create_dir_all(&args.arg_outdir)?; // It would be nice to support efficient parallel partitions, but doing // do would involve more complicated inter-thread communication, with @@ -73,9 +73,12 @@ impl Args { } /// Get the column to use as a key. - fn key_column(&self, rconfig: &Config, headers: &[csv::ByteString]) - -> CliResult { - let select_cols = try!(rconfig.selection(&*headers)); + fn key_column( + &self, + rconfig: &Config, + headers: &csv::ByteRecord, + ) -> CliResult { + let select_cols = rconfig.selection(headers)?; if select_cols.len() == 1 { Ok(select_cols[0]) } else { @@ -86,36 +89,36 @@ impl Args { /// A basic sequential partition. fn sequential_partition(&self) -> CliResult<()> { let rconfig = self.rconfig(); - let mut rdr = try!(rconfig.reader()); - let headers = try!(rdr.byte_headers()); - let key_col = try!(self.key_column(&rconfig, &*headers)); + let mut rdr = rconfig.reader()?; + let headers = rdr.byte_headers()?.clone(); + let key_col = self.key_column(&rconfig, &headers)?; let mut gen = WriterGenerator::new(self.flag_filename.clone()); - let mut writers: HashMap = + let mut writers: HashMap, BoxedWriter> = HashMap::new(); for row in rdr.byte_records() { - let row = try!(row); + let row = row?; // Decide what file to put this in. 
- let column = row[key_col].clone(); + let column = &row[key_col]; let key = match self.flag_prefix_length { // We exceed --prefix-length, so ignore the extra bytes. Some(len) if len < column.len() => &column[0..len], _ => &column[..], }; - let mut entry = writers.entry(key.to_owned()); + let mut entry = writers.entry(key.to_vec()); let wtr = match entry { Entry::Occupied(ref mut occupied) => occupied.get_mut(), Entry::Vacant(vacant) => { // We have a new key, so make a new writer. - let mut wtr = try!(gen.writer(&*self.arg_outdir, key)); + let mut wtr = gen.writer(&*self.arg_outdir, key)?; if !rconfig.no_headers { - try!(wtr.write(headers.iter())); + wtr.write_record(&headers)?; } vacant.insert(wtr) } }; - try!(wtr.write(row.into_iter())); + wtr.write_record(&row)?; } Ok(()) } @@ -137,13 +140,12 @@ impl WriterGenerator { template: template, counter: 1, used: HashSet::new(), - non_word_char: Regex::new(r#"\W"#).unwrap(), + non_word_char: Regex::new(r"\W").unwrap(), } } /// Create a CSV writer for `key`. Does not add headers. - fn writer

(&mut self, path: P, key: &[u8]) - -> io::Result + fn writer

(&mut self, path: P, key: &[u8]) -> io::Result where P: AsRef { let unique_value = self.unique_value(key); @@ -156,12 +158,13 @@ impl WriterGenerator { fn unique_value(&mut self, key: &[u8]) -> String { // Sanitize our key. let utf8 = String::from_utf8_lossy(key); - let safe = self.non_word_char.replace_all(&*utf8, ""); - let base = if safe.is_empty() { - "empty".to_owned() - } else { - safe - }; + let safe = self.non_word_char.replace_all(&*utf8, "").into_owned(); + let base = + if safe.is_empty() { + "empty".to_owned() + } else { + safe + }; // Now check for collisions. if !self.used.contains(&base) { diff --git a/src/cmd/sample.rs b/src/cmd/sample.rs index 34400c9f..20bef81d 100644 --- a/src/cmd/sample.rs +++ b/src/cmd/sample.rs @@ -1,11 +1,11 @@ use std::io; -use csv::{self, ByteString}; -use csv::index::Indexed; +use csv; use rand::Rng; use CliResult; use config::{Config, Delimiter}; +use index::Indexed; use util; static USAGE: &'static str = " @@ -48,60 +48,64 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; let rconfig = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(args.flag_no_headers); + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers); let sample_size = args.arg_sample_size; - let mut wtr = try!(Config::new(&args.flag_output).writer()); - let sampled = match try!(rconfig.indexed()) { + let mut wtr = Config::new(&args.flag_output).writer()?; + let sampled = match rconfig.indexed()? { Some(mut idx) => { if do_random_access(sample_size, idx.count()) { - try!(rconfig.write_headers(&mut *idx, &mut wtr)); - try!(sample_random_access(&mut idx, sample_size)) + rconfig.write_headers(&mut *idx, &mut wtr)?; + sample_random_access(&mut idx, sample_size)? 
} else { - let mut rdr = try!(rconfig.reader()); - try!(rconfig.write_headers(&mut rdr, &mut wtr)); - try!(sample_reservoir(&mut rdr, sample_size)) + let mut rdr = rconfig.reader()?; + rconfig.write_headers(&mut rdr, &mut wtr)?; + sample_reservoir(&mut rdr, sample_size)? } } _ => { - let mut rdr = try!(rconfig.reader()); - try!(rconfig.write_headers(&mut rdr, &mut wtr)); - try!(sample_reservoir(&mut rdr, sample_size)) + let mut rdr = rconfig.reader()?; + rconfig.write_headers(&mut rdr, &mut wtr)?; + sample_reservoir(&mut rdr, sample_size)? } }; for row in sampled.into_iter() { - try!(wtr.write(row.into_iter())); + wtr.write_record(&row)?; } - Ok(try!(wtr.flush())) + Ok(wtr.flush()?) } -fn sample_random_access - (idx: &mut Indexed, sample_size: u64) - -> CliResult>> { +fn sample_random_access( + idx: &mut Indexed, + sample_size: u64, +) -> CliResult> +where R: io::Read + io::Seek, I: io::Read + io::Seek +{ let mut all_indices = (0..idx.count()).collect::>(); let mut rng = ::rand::thread_rng(); rng.shuffle(&mut *all_indices); let mut sampled = Vec::with_capacity(sample_size as usize); for i in all_indices.into_iter().take(sample_size as usize) { - try!(idx.seek(i)); - sampled.push(try!(idx.byte_records().next().unwrap())); + idx.seek(i)?; + sampled.push(idx.byte_records().next().unwrap()?); } Ok(sampled) } -fn sample_reservoir - (rdr: &mut csv::Reader, sample_size: u64) - -> CliResult>> { +fn sample_reservoir( + rdr: &mut csv::Reader, + sample_size: u64, +) -> CliResult> { // The following algorithm has been adapted from: // http://en.wikipedia.org/wiki/Reservoir_sampling let mut reservoir = Vec::with_capacity(sample_size as usize); let mut records = rdr.byte_records().enumerate(); for (_, row) in records.by_ref().take(reservoir.capacity()) { - reservoir.push(try!(row)); + reservoir.push(row?); } // Now do the sampling. 
@@ -109,7 +113,7 @@ fn sample_reservoir for (i, row) in records { let random = rng.gen_range(0, i+1); if random < sample_size as usize { - reservoir[random] = try!(row); + reservoir[random] = row?; } } Ok(reservoir) diff --git a/src/cmd/search.rs b/src/cmd/search.rs index 0aa11764..7ebc506e 100644 --- a/src/cmd/search.rs +++ b/src/cmd/search.rs @@ -1,4 +1,4 @@ -use regex::RegexBuilder; +use regex::bytes::RegexBuilder; use CliResult; use config::{Config, Delimiter}; @@ -47,35 +47,33 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); - let pattern = try!( - RegexBuilder::new(&*args.arg_regex) - .case_insensitive(args.flag_ignore_case) - .compile() - ); + let args: Args = util::get_args(USAGE, argv)?; + let pattern = RegexBuilder::new(&*args.arg_regex) + .case_insensitive(args.flag_ignore_case) + .build()?; let rconfig = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(args.flag_no_headers) - .select(args.flag_select); + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers) + .select(args.flag_select); - let mut rdr = try!(rconfig.reader()); - let mut wtr = try!(Config::new(&args.flag_output).writer()); + let mut rdr = rconfig.reader()?; + let mut wtr = Config::new(&args.flag_output).writer()?; - let headers = try!(rdr.byte_headers()); - let nsel = try!(rconfig.normal_selection(&*headers)); + let headers = rdr.byte_headers()?.clone(); + let nsel = rconfig.normal_selection(&headers)?; if !rconfig.no_headers { - try!(wtr.write(headers.iter())); + wtr.write_record(&headers)?; } - for row in rdr.records() { - let row = try!(row); - let mut m = nsel.select(row.iter()).any(|f| pattern.is_match(&**f)); + for row in rdr.byte_records() { + let row = row?; + let mut m = nsel.select(row.iter()).any(|f| pattern.is_match(f)); if args.flag_invert_match { m = !m; } if m { - try!(wtr.write(row.iter().map(|f| &**f))); + wtr.write_record(&row)?; } } - Ok(try!(wtr.flush())) + 
Ok(wtr.flush()?) } diff --git a/src/cmd/select.rs b/src/cmd/select.rs index 87cda8f8..7c3a3711 100644 --- a/src/cmd/select.rs +++ b/src/cmd/select.rs @@ -1,3 +1,5 @@ +use csv; + use CliResult; use config::{Config, Delimiter}; use select::SelectColumns; @@ -55,29 +57,26 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; let rconfig = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(args.flag_no_headers) - .select(args.arg_selection); + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers) + .select(args.arg_selection); - let mut rdr = try!(rconfig.reader()); - let mut wtr = try!(Config::new(&args.flag_output).writer()); + let mut rdr = rconfig.reader()?; + let mut wtr = Config::new(&args.flag_output).writer()?; - let headers = try!(rdr.byte_headers()); - let sel = try!(rconfig.selection(&*headers)); + let headers = rdr.byte_headers()?.clone(); + let sel = rconfig.selection(&headers)?; if !rconfig.no_headers { - try!(wtr.write(sel.iter().map(|&i| &*headers[i]))); + wtr.write_record(sel.iter().map(|&i| &headers[i]))?; } - for r in rdr.byte_records() { - // TODO: I don't think we can do any better here. Since selection - // operates on indices, some kind of allocation is probably required. - // try!(wtr.write(sel.select(try!(r)[]))) - let r = try!(r); - try!(wtr.write(sel.iter().map(|&i| &*r[i]))); + let mut record = csv::ByteRecord::new(); + while rdr.read_byte_record(&mut record)? 
{ + wtr.write_record(sel.iter().map(|&i| &record[i]))?; } - try!(wtr.flush()); + wtr.flush()?; Ok(()) } diff --git a/src/cmd/slice.rs b/src/cmd/slice.rs index 464fb29b..5a6e7e0d 100644 --- a/src/cmd/slice.rs +++ b/src/cmd/slice.rs @@ -1,9 +1,9 @@ use std::fs; -use csv::index::Indexed; use CliResult; use config::{Config, Delimiter}; +use index::Indexed; use util; static USAGE: &'static str = " @@ -54,8 +54,8 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); - match try!(args.rconfig().indexed()) { + let args: Args = util::get_args(USAGE, argv)?; + match args.rconfig().indexed()? { None => args.no_index(), Some(idxed) => args.with_index(idxed), } @@ -63,42 +63,45 @@ pub fn run(argv: &[&str]) -> CliResult<()> { impl Args { fn no_index(&self) -> CliResult<()> { - let mut rdr = try!(self.rconfig().reader()); - let mut wtr = try!(self.wconfig().writer()); - try!(self.rconfig().write_headers(&mut rdr, &mut wtr)); + let mut rdr = self.rconfig().reader()?; + let mut wtr = self.wconfig().writer()?; + self.rconfig().write_headers(&mut rdr, &mut wtr)?; - let (start, end) = try!(self.range()); + let (start, end) = self.range()?; for r in rdr.byte_records().skip(start).take(end - start) { - try!(wtr.write(try!(r).into_iter())); + wtr.write_record(&r?)?; } - Ok(try!(wtr.flush())) + Ok(wtr.flush()?) 
} - fn with_index(&self, mut idx: Indexed) - -> CliResult<()> { - let mut wtr = try!(self.wconfig().writer()); - try!(self.rconfig().write_headers(&mut *idx, &mut wtr)); + fn with_index( + &self, + mut idx: Indexed, + ) -> CliResult<()> { + let mut wtr = self.wconfig().writer()?; + self.rconfig().write_headers(&mut *idx, &mut wtr)?; - let (start, end) = try!(self.range()); + let (start, end) = self.range()?; if end - start == 0 { return Ok(()); } - try!(idx.seek(start as u64)); + idx.seek(start as u64)?; for r in idx.byte_records().take(end - start) { - try!(wtr.write(try!(r).into_iter())); + wtr.write_record(&r?)?; } - Ok(try!(wtr.flush())) + wtr.flush()?; + Ok(()) } fn range(&self) -> Result<(usize, usize), String> { - util::range(self.flag_start, self.flag_end, - self.flag_len, self.flag_index) + util::range( + self.flag_start, self.flag_end, self.flag_len, self.flag_index) } fn rconfig(&self) -> Config { Config::new(&self.arg_input) - .delimiter(self.flag_delimiter) - .no_headers(self.flag_no_headers) + .delimiter(self.flag_delimiter) + .no_headers(self.flag_no_headers) } fn wconfig(&self) -> Config { diff --git a/src/cmd/sort.rs b/src/cmd/sort.rs index bf6beaef..0db828c2 100644 --- a/src/cmd/sort.rs +++ b/src/cmd/sort.rs @@ -43,54 +43,53 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); - + let args: Args = util::get_args(USAGE, argv)?; let numeric = args.flag_numeric; let reverse = args.flag_reverse; let rconfig = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(args.flag_no_headers) - .select(args.flag_select); + .delimiter(args.flag_delimiter) + .no_headers(args.flag_no_headers) + .select(args.flag_select); - let mut rdr = try!(rconfig.reader()); - let mut wtr = try!(Config::new(&args.flag_output).writer()); + let mut rdr = rconfig.reader()?; + let mut wtr = Config::new(&args.flag_output).writer()?; - let headers = try!(rdr.byte_headers()); - let sel = 
try!(rconfig.selection(&*headers)); + let headers = rdr.byte_headers()?.clone(); + let sel = rconfig.selection(&headers)?; - let mut all = try!(rdr.byte_records().collect::, _>>()); + let mut all = rdr.byte_records().collect::, _>>()?; match (numeric, reverse) { (false, false) => all.sort_by(|r1, r2| { - let a = sel.select(r1.as_slice()); - let b = sel.select(r2.as_slice()); + let a = sel.select(r1); + let b = sel.select(r2); iter_cmp(a, b) }), (true, false) => all.sort_by(|r1, r2| { - let a = sel.select(r1.as_slice()); - let b = sel.select(r2.as_slice()); + let a = sel.select(r1); + let b = sel.select(r2); iter_cmp_num(a, b) }), (false, true) => all.sort_by(|r1, r2| { - let a = sel.select(r1.as_slice()); - let b = sel.select(r2.as_slice()); + let a = sel.select(r1); + let b = sel.select(r2); iter_cmp(b, a) }), (true, true) => all.sort_by(|r1, r2| { - let a = sel.select(r1.as_slice()); - let b = sel.select(r2.as_slice()); + let a = sel.select(r1); + let b = sel.select(r2); iter_cmp_num(b, a) }), } - try!(rconfig.write_headers(&mut rdr, &mut wtr)); + rconfig.write_headers(&mut rdr, &mut wtr)?; for r in all.into_iter() { - try!(wtr.write(r.into_iter())); + wtr.write_record(&r)?; } - Ok(try!(wtr.flush())) + Ok(wtr.flush()?) 
} /// Order `a` and `b` lexicographically using `Ord` diff --git a/src/cmd/split.rs b/src/cmd/split.rs index 70ad0cca..01b90a5a 100644 --- a/src/cmd/split.rs +++ b/src/cmd/split.rs @@ -4,11 +4,11 @@ use std::path::Path; use chan; use csv; -use csv::index::Indexed; use threadpool::ThreadPool; use CliResult; use config::{Config, Delimiter}; +use index::Indexed; use util::{self, FilenameTemplate}; static USAGE: &'static str = " @@ -58,13 +58,13 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; if args.flag_size == 0 { return fail!("--size must be greater than 0."); } - try!(fs::create_dir_all(&args.arg_outdir)); + fs::create_dir_all(&args.arg_outdir)?; - match try!(args.rconfig().indexed()) { + match args.rconfig().indexed()? { Some(idx) => args.parallel_split(idx), None => args.sequential_split(), } @@ -73,26 +73,27 @@ pub fn run(argv: &[&str]) -> CliResult<()> { impl Args { fn sequential_split(&self) -> CliResult<()> { let rconfig = self.rconfig(); - let mut rdr = try!(rconfig.reader()); - let headers = try!(rdr.byte_headers()); + let mut rdr = rconfig.reader()?; + let headers = rdr.byte_headers()?.clone(); - let mut wtr = try!(self.new_writer(&*headers, 0)); + let mut wtr = self.new_writer(&headers, 0)?; for (i, row) in rdr.byte_records().enumerate() { if i > 0 && i % self.flag_size == 0 { - try!(wtr.flush()); - wtr = try!(self.new_writer(&*headers, i)); + wtr.flush()?; + wtr = self.new_writer(&headers, i)?; } - let row = try!(row); - try!(wtr.write(row.into_iter())); + wtr.write_record(&row?)?; } - try!(wtr.flush()); + wtr.flush()?; Ok(()) } - fn parallel_split(&self, idx: Indexed) - -> CliResult<()> { - let nchunks = util::num_of_chunks(idx.count() as usize, - self.flag_size); + fn parallel_split( + &self, + idx: Indexed, + ) -> CliResult<()> { + let nchunks = util::num_of_chunks( + idx.count() as usize, self.flag_size); let pool = 
ThreadPool::new(self.njobs()); let wg = chan::WaitGroup::new(); for i in 0..nchunks { @@ -102,14 +103,15 @@ impl Args { pool.execute(move || { let conf = args.rconfig(); let mut idx = conf.indexed().unwrap().unwrap(); - let headers = idx.byte_headers().unwrap(); - let mut wtr = args.new_writer(&*headers, i * args.flag_size) - .unwrap(); + let headers = idx.byte_headers().unwrap().clone(); + let mut wtr = args + .new_writer(&headers, i * args.flag_size) + .unwrap(); idx.seek((i * args.flag_size) as u64).unwrap(); for row in idx.byte_records().take(args.flag_size) { let row = row.unwrap(); - wtr.write(row.into_iter()).unwrap(); + wtr.write_record(row.into_iter()).unwrap(); } wtr.flush().unwrap(); wg.done(); @@ -119,25 +121,32 @@ impl Args { Ok(()) } - fn new_writer(&self, headers: &[csv::ByteString], start: usize) - -> CliResult>> { + fn new_writer( + &self, + headers: &csv::ByteRecord, + start: usize, + ) -> CliResult>> { let dir = Path::new(&self.arg_outdir); let path = dir.join(self.flag_filename.filename(&format!("{}", start))); let spath = Some(path.display().to_string()); - let mut wtr = try!(Config::new(&spath).writer()); + let mut wtr = Config::new(&spath).writer()?; if !self.rconfig().no_headers { - try!(wtr.write(headers.iter().map(|f| &**f))); + wtr.write_record(headers)?; } Ok(wtr) } fn rconfig(&self) -> Config { Config::new(&self.arg_input) - .delimiter(self.flag_delimiter) - .no_headers(self.flag_no_headers) + .delimiter(self.flag_delimiter) + .no_headers(self.flag_no_headers) } fn njobs(&self) -> usize { - if self.flag_jobs == 0 { util::num_cpus() } else { self.flag_jobs } + if self.flag_jobs == 0 { + util::num_cpus() + } else { + self.flag_jobs + } } } diff --git a/src/cmd/stats.rs b/src/cmd/stats.rs index 339c1c23..893324c0 100644 --- a/src/cmd/stats.rs +++ b/src/cmd/stats.rs @@ -3,17 +3,17 @@ use std::default::Default; use std::fmt; use std::fs; use std::io; -use std::iter::repeat; +use std::iter::{FromIterator, repeat}; use std::str::{self, 
FromStr}; use chan; -use csv::{self, ByteString}; -use csv::index::Indexed; +use csv; use stats::{Commute, OnlineStats, MinMax, Unsorted, merge_all}; use threadpool::ThreadPool; use CliResult; use config::{Config, Delimiter}; +use index::Indexed; use select::{SelectColumns, Selection}; use util; @@ -83,10 +83,10 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); + let args: Args = util::get_args(USAGE, argv)?; - let mut wtr = try!(Config::new(&args.flag_output).writer()); - let (headers, stats) = try!(match try!(args.rconfig().indexed()) { + let mut wtr = Config::new(&args.flag_output).writer()?; + let (headers, stats) = match args.rconfig().indexed()? { None => args.sequential_stats(), Some(idx) => { if args.flag_jobs == 1 { @@ -95,43 +95,45 @@ pub fn run(argv: &[&str]) -> CliResult<()> { args.parallel_stats(idx) } } - }); + }?; let stats = args.stats_to_records(stats); - try!(wtr.write(args.stat_headers().iter())); + wtr.write_record(&args.stat_headers())?; let fields = headers.iter().zip(stats.into_iter()); for (i, (header, stat)) in fields.enumerate() { - let header = if args.flag_no_headers { - i.to_string().into_bytes() - } else { - header.clone() - }; - let row = vec![&*header].into_iter() - .chain(stat.iter().map(|f| f.as_bytes())); - try!(wtr.write(row)); + let header = + if args.flag_no_headers { + i.to_string().into_bytes() + } else { + header.to_vec() + }; + let stat = stat.iter().map(|f| f.as_bytes()); + wtr.write_record(vec![&*header].into_iter().chain(stat))?; } + wtr.flush()?; Ok(()) } impl Args { - fn sequential_stats(&self) - -> CliResult<(Vec, Vec)> { - let mut rdr = try!(self.rconfig().reader()); - let (headers, sel) = try!(self.sel_headers(&mut rdr)); - let stats = try!(self.compute(&sel, rdr.byte_records())); + fn sequential_stats(&self) -> CliResult<(csv::ByteRecord, Vec)> { + let mut rdr = self.rconfig().reader()?; + let (headers, sel) = self.sel_headers(&mut rdr)?; + let 
stats = self.compute(&sel, rdr.byte_records())?; Ok((headers, stats)) } - fn parallel_stats(&self, idx: Indexed) - -> CliResult<(Vec, Vec)> { + fn parallel_stats( + &self, + idx: Indexed, + ) -> CliResult<(csv::ByteRecord, Vec)> { // N.B. This method doesn't handle the case when the number of records - // is zero correctly. (So we use `sequential_stats` instead. + // is zero correctly. So we use `sequential_stats` instead. if idx.count() == 0 { return self.sequential_stats(); } - let mut rdr = try!(self.rconfig().reader()); - let (headers, sel) = try!(self.sel_headers(&mut rdr)); + let mut rdr = self.rconfig().reader()?; + let (headers, sel) = self.sel_headers(&mut rdr)?; let chunk_size = util::chunk_size(idx.count() as usize, self.njobs()); let nchunks = util::num_of_chunks(idx.count() as usize, chunk_size); @@ -151,8 +153,10 @@ impl Args { Ok((headers, merge_all(recv.iter()).unwrap_or_else(Vec::new))) } - fn stats_to_records(&self, stats: Vec) -> Vec> { - let mut records: Vec<_> = repeat(vec![]).take(stats.len()).collect(); + fn stats_to_records(&self, stats: Vec) -> Vec { + let mut records: Vec<_> = repeat(csv::StringRecord::new()) + .take(stats.len()) + .collect(); let pool = ThreadPool::new(self.njobs()); let mut results = vec![]; for mut stat in stats.into_iter() { @@ -167,29 +171,31 @@ impl Args { } fn compute(&self, sel: &Selection, it: I) -> CliResult> - where I: Iterator>> { + where I: Iterator> { let mut stats = self.new_stats(sel.len()); for row in it { - let row = try!(row); - for (i, field) in sel.select(&*row).enumerate() { + let row = row?; + for (i, field) in sel.select(&row).enumerate() { stats[i].add(field); } } Ok(stats) } - fn sel_headers(&self, rdr: &mut csv::Reader) - -> CliResult<(Vec, Selection)> { - let headers = try!(rdr.byte_headers()); - let sel = try!(self.rconfig().selection(&*headers)); - Ok((sel.select(&*headers).map(|h| h.to_vec()).collect(), sel)) + fn sel_headers( + &self, + rdr: &mut csv::Reader, + ) -> 
CliResult<(csv::ByteRecord, Selection)> { + let headers = rdr.byte_headers()?.clone(); + let sel = self.rconfig().selection(&headers)?; + Ok((csv::ByteRecord::from_iter(sel.select(&headers)), sel)) } fn rconfig(&self) -> Config { Config::new(&self.arg_input) - .delimiter(self.flag_delimiter) - .no_headers(self.flag_no_headers) - .select(self.flag_select.clone()) + .delimiter(self.flag_delimiter) + .no_headers(self.flag_no_headers) + .select(self.flag_select.clone()) } fn njobs(&self) -> usize { @@ -208,7 +214,7 @@ impl Args { })).take(record_len).collect() } - fn stat_headers(&self) -> Vec { + fn stat_headers(&self) -> csv::StringRecord { let mut fields = vec![ "field", "type", "sum", "min", "max", "min_length", "max_length", "mean", "stddev", @@ -217,7 +223,7 @@ impl Args { if self.flag_median || all { fields.push("median"); } if self.flag_mode || all { fields.push("mode"); } if self.flag_cardinality || all { fields.push("cardinality"); } - fields.into_iter().map(|s| s.to_owned()).collect() + csv::StringRecord::from(fields) } } @@ -244,7 +250,7 @@ struct Stats { sum: Option, minmax: Option, online: Option, - mode: Option>, + mode: Option>>, median: Option>, which: WhichStats, } @@ -299,7 +305,7 @@ impl Stats { } } - fn to_record(&mut self) -> Vec { + fn to_record(&mut self) -> csv::StringRecord { let typ = self.typ; let mut pieces = vec![]; let empty = || "".to_owned(); @@ -348,7 +354,7 @@ impl Stats { } Some(ref mut v) => { if self.which.mode { - let lossy = |s: ByteString| -> String { + let lossy = |s: Vec| -> String { String::from_utf8_lossy(&*s).into_owned() }; pieces.push( @@ -359,7 +365,7 @@ impl Stats { } } } - pieces + csv::StringRecord::from(pieces) } } @@ -506,7 +512,7 @@ impl Commute for TypedSum { /// where min/max makes sense. 
#[derive(Clone)] struct TypedMinMax { - strings: MinMax, + strings: MinMax>, str_len: MinMax, integers: MinMax, floats: MinMax, diff --git a/src/cmd/table.rs b/src/cmd/table.rs index 0a97d4bc..16d63388 100644 --- a/src/cmd/table.rs +++ b/src/cmd/table.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; +use csv; use tabwriter::TabWriter; use CliResult; @@ -46,26 +47,25 @@ struct Args { } pub fn run(argv: &[&str]) -> CliResult<()> { - let args: Args = try!(util::get_args(USAGE, argv)); - + let args: Args = util::get_args(USAGE, argv)?; let rconfig = Config::new(&args.arg_input) - .delimiter(args.flag_delimiter) - .no_headers(true); + .delimiter(args.flag_delimiter) + .no_headers(true); let wconfig = Config::new(&args.flag_output) - .delimiter(Some(Delimiter(b'\t'))); + .delimiter(Some(Delimiter(b'\t'))); - let tw = TabWriter::new(try!(wconfig.io_writer())) - .minwidth(args.flag_width) - .padding(args.flag_pad); + let tw = TabWriter::new(wconfig.io_writer()?) + .minwidth(args.flag_width) + .padding(args.flag_pad); let mut wtr = wconfig.from_writer(tw); - let mut rdr = try!(rconfig.reader()); + let mut rdr = rconfig.reader()?; - for r in rdr.byte_records() { - let r = try!(r); - let row = r.iter().map(|f| util::condense(Cow::Borrowed(&**f), - args.flag_condense)); - try!(wtr.write(row)); + let mut record = csv::ByteRecord::new(); + while rdr.read_byte_record(&mut record)? 
{ + wtr.write_record(record.iter().map(|f| { + util::condense(Cow::Borrowed(f), args.flag_condense) + }))?; } - try!(wtr.flush()); + wtr.flush()?; Ok(()) } diff --git a/src/config.rs b/src/config.rs index fe3522f4..41cd9a70 100644 --- a/src/config.rs +++ b/src/config.rs @@ -7,7 +7,7 @@ use std::ops::Deref; use std::path::PathBuf; use csv; -use csv::index::Indexed; +use index::Indexed; use rustc_serialize::{Decodable, Decoder}; use CliResult; @@ -31,7 +31,7 @@ impl Delimiter { impl Decodable for Delimiter { fn decode(d: &mut D) -> Result { - let c = try!(d.read_str()); + let c = d.read_str()?; match &*c { r"\t" => Ok(Delimiter(b'\t')), s => { @@ -60,7 +60,11 @@ pub struct Config { delimiter: u8, pub no_headers: bool, flexible: bool, - crlf: bool, + terminator: csv::Terminator, + quote: u8, + quote_style: csv::QuoteStyle, + double_quote: bool, + escape: u8, } impl Config { @@ -86,7 +90,11 @@ impl Config { delimiter: delim, no_headers: false, flexible: false, - crlf: false, + terminator: csv::Terminator::Any(b'\n'), + quote: b'"', + quote_style: csv::QuoteStyle::Necessary, + double_quote: true, + escape: b'\\', } } @@ -111,7 +119,36 @@ impl Config { } pub fn crlf(mut self, yes: bool) -> Config { - self.crlf = yes; + if yes { + self.terminator = csv::Terminator::CRLF; + } else { + self.terminator = csv::Terminator::Any(b'\n'); + } + self + } + + pub fn terminator(mut self, term: csv::Terminator) -> Config { + self.terminator = term; + self + } + + pub fn quote(mut self, quote: u8) -> Config { + self.quote = quote; + self + } + + pub fn quote_style(mut self, style: csv::QuoteStyle) -> Config { + self.quote_style = style; + self + } + + pub fn double_quote(mut self, yes: bool) -> Config { + self.double_quote = yes; + self + } + + pub fn escape(mut self, escape: u8) -> Config { + self.escape = escape; self } @@ -124,8 +161,10 @@ impl Config { self.path.is_none() } - pub fn selection(&self, first_record: &[csv::ByteString]) - -> Result { + pub fn selection( + &self, + 
first_record: &csv::ByteRecord, + ) -> Result { match self.select_columns { None => Err("Config has no 'SelectColums'. Did you call \ Config::select?".to_owned()), @@ -133,8 +172,10 @@ impl Config { } } - pub fn normal_selection(&self, first_record: &[csv::ByteString]) - -> Result { + pub fn normal_selection( + &self, + first_record: &csv::ByteRecord, + ) -> Result { self.selection(first_record).map(|sel| sel.normal()) } @@ -142,9 +183,9 @@ impl Config { (&self, r: &mut csv::Reader, w: &mut csv::Writer) -> csv::Result<()> { if !self.no_headers { - let r = try!(r.byte_headers()); + let r = r.byte_headers()?; if !r.is_empty() { - try!(w.write(r.into_iter())); + w.write_record(r)?; } } Ok(()) @@ -152,12 +193,12 @@ impl Config { pub fn writer(&self) -> io::Result>> { - Ok(self.from_writer(try!(self.io_writer()))) + Ok(self.from_writer(self.io_writer()?)) } pub fn reader(&self) -> io::Result>> { - Ok(self.from_reader(try!(self.io_reader()))) + Ok(self.from_reader(self.io_reader()?)) } pub fn reader_file(&self) -> io::Result> { @@ -187,25 +228,22 @@ impl Config { Err(_) => return Ok(None), Ok(f) => f, }; - (try!(fs::File::open(p)), idx_file) + (fs::File::open(p)?, idx_file) } (&Some(ref p), &Some(ref ip)) => { - (try!(fs::File::open(p)), try!(fs::File::open(ip))) + (fs::File::open(p)?, fs::File::open(ip)?) } }; // If the CSV data was last modified after the index file was last // modified, then return an error and demand the user regenerate the // index. - let data_modified = util::last_modified(&try!(csv_file.metadata())); - let idx_modified = util::last_modified(&try!(idx_file.metadata())); + let data_modified = util::last_modified(&csv_file.metadata()?); + let idx_modified = util::last_modified(&idx_file.metadata()?); if data_modified > idx_modified { return Err(io::Error::new( io::ErrorKind::Other, "The CSV file was modified after the index file. 
\ Please re-create the index.", - // Some(format!("CSV file: {}, index file: {}", - // csv_file.path().unwrap().to_string_lossy(), - // idx_file.path().unwrap().to_string_lossy())), )); } let csv_rdr = self.from_reader(csv_file); @@ -214,9 +252,9 @@ impl Config { pub fn indexed(&self) -> CliResult>> { - match try!(self.index_files()) { + match self.index_files()? { None => Ok(None), - Some((r, i)) => Ok(Some(try!(Indexed::open(r, i)))), + Some((r, i)) => Ok(Some(Indexed::open(r, i)?)), } } @@ -240,25 +278,29 @@ impl Config { } pub fn from_reader(&self, rdr: R) -> csv::Reader { - csv::Reader::from_reader(rdr) - .flexible(self.flexible) - .delimiter(self.delimiter) - .has_headers(!self.no_headers) + csv::ReaderBuilder::new() + .flexible(self.flexible) + .delimiter(self.delimiter) + .has_headers(!self.no_headers) + .from_reader(rdr) } pub fn io_writer(&self) -> io::Result> { Ok(match self.path { None => Box::new(io::stdout()), - Some(ref p) => Box::new(try!(fs::File::create(p))), + Some(ref p) => Box::new(fs::File::create(p)?), }) } pub fn from_writer(&self, wtr: W) -> csv::Writer { - let term = if self.crlf { csv::RecordTerminator::CRLF } - else { csv::RecordTerminator::Any(b'\n') }; - csv::Writer::from_writer(wtr) - .flexible(self.flexible) - .delimiter(self.delimiter) - .record_terminator(term) + csv::WriterBuilder::new() + .flexible(self.flexible) + .delimiter(self.delimiter) + .terminator(self.terminator) + .quote(self.quote) + .quote_style(self.quote_style) + .double_quote(self.double_quote) + .escape(self.escape) + .from_writer(wtr) } } diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 00000000..239c289d --- /dev/null +++ b/src/index.rs @@ -0,0 +1,61 @@ +use std::io; +use std::ops; + +use csv; +use csv_index::RandomAccessSimple; + +use CliResult; + +/// Indexed composes a CSV reader with a simple random access index. 
+pub struct Indexed { + csv_rdr: csv::Reader, + idx: RandomAccessSimple, +} + +impl ops::Deref for Indexed { + type Target = csv::Reader; + fn deref(&self) -> &csv::Reader { &self.csv_rdr } +} + +impl ops::DerefMut for Indexed { + fn deref_mut(&mut self) -> &mut csv::Reader { &mut self.csv_rdr } +} + +impl Indexed { + /// Opens an index. + pub fn open( + csv_rdr: csv::Reader, + idx_rdr: I, + ) -> CliResult> { + Ok(Indexed { + csv_rdr: csv_rdr, + idx: RandomAccessSimple::open(idx_rdr)?, + }) + } + + /// Return the number of records (not including the header record) in this + /// index. + pub fn count(&self) -> u64 { + if self.csv_rdr.has_headers() && !self.idx.is_empty() { + self.idx.len() - 1 + } else { + self.idx.len() + } + } + + /// Seek to the starting position of record `i`. + pub fn seek(&mut self, mut i: u64) -> CliResult<()> { + if i >= self.count() { + let msg = format!( + "invalid record index {} (there are {} records)", + i, self.count()); + return fail!(io::Error::new(io::ErrorKind::Other, msg)); + } + if self.csv_rdr.has_headers() { + i += 1; + } + let pos = self.idx.get(i)?; + self.csv_rdr.seek(pos)?; + Ok(()) + } +} diff --git a/src/main.rs b/src/main.rs index 8b5977bb..ca0f5ce8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,12 +1,7 @@ -/*! -These are some docs. -*/ - -#![allow(deprecated)] // for connect -> join rename - extern crate byteorder; extern crate chan; extern crate csv; +extern crate csv_index; extern crate docopt; extern crate filetime; extern crate num_cpus; @@ -71,6 +66,7 @@ macro_rules! 
command_list { mod cmd; mod config; +mod index; mod select; mod util; diff --git a/src/select.rs b/src/select.rs index b87a59b3..1053c749 100644 --- a/src/select.rs +++ b/src/select.rs @@ -6,9 +6,8 @@ use std::ops; use std::slice; use std::str::FromStr; -use rustc_serialize::{Decodable, Decoder}; - use csv; +use rustc_serialize::{Decodable, Decoder}; #[derive(Clone)] pub struct SelectColumns { @@ -26,13 +25,16 @@ impl SelectColumns { false }; Ok(SelectColumns { - selectors: try!(SelectorParser::new(s).parse()), + selectors: SelectorParser::new(s).parse()?, invert: invert, }) } - pub fn selection(&self, first_record: &[csv::ByteString], use_names: bool) - -> Result { + pub fn selection( + &self, + first_record: &csv::ByteRecord, + use_names: bool, + ) -> Result { if self.selectors.is_empty() { return Ok(Selection(if self.invert { // Inverting everything means we get nothing. @@ -45,7 +47,7 @@ impl SelectColumns { let mut map = vec![]; for sel in &self.selectors { let idxs = sel.indices(first_record, use_names); - map.extend(try!(idxs).into_iter()); + map.extend(idxs?.into_iter()); } if self.invert { let set: HashSet<_> = map.into_iter().collect(); @@ -69,14 +71,14 @@ impl fmt::Debug for SelectColumns { let strs: Vec<_> = self.selectors .iter().map(|sel| format!("{:?}", sel)).collect(); - write!(f, "{}", strs.connect(", ")) + write!(f, "{}", strs.join(", ")) } } } impl Decodable for SelectColumns { fn decode(d: &mut D) -> Result { - SelectColumns::parse(&*try!(d.read_str())) + SelectColumns::parse(&*d.read_str()?) .map_err(|e| d.error(&e)) } } @@ -101,7 +103,7 @@ impl SelectorParser { if self.cur() == Some('-') { OneSelector::Start } else { - try!(self.parse_one()) + self.parse_one()? }; let f2: Option = if self.cur() == Some('-') { @@ -109,7 +111,7 @@ impl SelectorParser { Some(if self.is_end_of_selector() { OneSelector::End } else { - try!(self.parse_one()) + self.parse_one()? 
}) } else { None @@ -132,12 +134,12 @@ impl SelectorParser { let name = if self.cur() == Some('"') { self.bump(); - try!(self.parse_quoted_name()) + self.parse_quoted_name()? } else { - try!(self.parse_name()) + self.parse_name()? }; Ok(if self.cur() == Some('[') { - let idx = try!(self.parse_index()); + let idx = self.parse_index()?; OneSelector::IndexedName(name, idx) } else { match FromStr::from_str(&name) { @@ -234,15 +236,18 @@ enum OneSelector { } impl Selector { - fn indices(&self, first_record: &[csv::ByteString], use_names: bool) - -> Result, String> { + fn indices( + &self, + first_record: &csv::ByteRecord, + use_names: bool, + ) -> Result, String> { match *self { Selector::One(ref sel) => { sel.index(first_record, use_names).map(|i| vec![i]) } Selector::Range(ref sel1, ref sel2) => { - let i1 = try!(sel1.index(first_record, use_names)); - let i2 = try!(sel2.index(first_record, use_names)); + let i1 = sel1.index(first_record, use_names)?; + let i2 = sel2.index(first_record, use_names)?; Ok(match i1.cmp(&i2) { Ordering::Equal => vec!(i1), Ordering::Less => (i1..(i2 + 1)).collect(), @@ -262,8 +267,11 @@ impl Selector { } impl OneSelector { - fn index(&self, first_record: &[csv::ByteString], use_names: bool) - -> Result { + fn index( + &self, + first_record: &csv::ByteRecord, + use_names: bool, + ) -> Result { match *self { OneSelector::Start => Ok(0), OneSelector::End => Ok( @@ -290,7 +298,7 @@ impl OneSelector { } let mut num_found = 0; for (i, field) in first_record.iter().enumerate() { - if *field == s.as_bytes() { + if field == s.as_bytes() { if num_found == sidx { return Ok(i); } @@ -337,17 +345,17 @@ impl fmt::Debug for OneSelector { pub struct Selection(Vec); pub type _GetField = - for <'c> fn(&mut &'c [csv::ByteString], &usize) -> Option<&'c [u8]>; + for <'c> fn(&mut &'c csv::ByteRecord, &usize) -> Option<&'c [u8]>; impl Selection { - pub fn select<'a, 'b>(&'a self, row: &'b [csv::ByteString]) + pub fn select<'a, 'b>(&'a self, row: &'b 
csv::ByteRecord) -> iter::Scan< slice::Iter<'a, usize>, - &'b [csv::ByteString], + &'b csv::ByteRecord, _GetField, > { // This is horrifying. - fn get_field<'c>(row: &mut &'c [csv::ByteString], idx: &usize) + fn get_field<'c>(row: &mut &'c csv::ByteRecord, idx: &usize) -> Option<&'c [u8]> { Some(&row[*idx]) } diff --git a/src/util.rs b/src/util.rs index ff80fa97..b06823e3 100644 --- a/src/util.rs +++ b/src/util.rs @@ -51,7 +51,7 @@ pub fn many_configs(inps: &[String], delim: Option, .delimiter(delim) .no_headers(no_headers)) .collect::>(); - try!(errif_greater_one_stdin(&*confs)); + errif_greater_one_stdin(&*confs)?; Ok(confs) } @@ -63,8 +63,6 @@ pub fn errif_greater_one_stdin(inps: &[Config]) -> Result<(), String> { Ok(()) } -pub fn empty_field() -> csv::ByteString { vec![] } - pub fn chunk_size(nitems: usize, njobs: usize) -> usize { if nitems < njobs { nitems @@ -93,11 +91,6 @@ pub fn condense<'a>(val: Cow<'a, [u8]>, n: Option) -> Cow<'a, [u8]> { match n { None => val, Some(n) => { - // It would be much nicer to just use a `match` here, but the - // borrow checker won't allow it. ---AG - // - // (We could circumvent it by allocating a new Unicode string, - // but that seems excessive.) 
let mut is_short_utf8 = false; if let Ok(s) = str::from_utf8(&*val) { if n >= s.chars().count() { @@ -212,7 +205,7 @@ impl FilenameTemplate { impl Decodable for FilenameTemplate { fn decode(d: &mut D) -> Result { - let raw = try!(d.read_str()); + let raw = d.read_str()?; let chunks = raw.split("{}").collect::>(); if chunks.len() == 2 { Ok(FilenameTemplate { diff --git a/tests/test_frequency.rs b/tests/test_frequency.rs index 35ec1ea2..ef96e29a 100644 --- a/tests/test_frequency.rs +++ b/tests/test_frequency.rs @@ -166,7 +166,7 @@ fn param_prop_frequency(name: &str, rows: CsvData, idx: bool) -> bool { type FTables = HashMap>; -#[derive(RustcDecodable)] +#[derive(Deserialize, RustcDecodable)] struct FRow { field: String, value: String, @@ -197,9 +197,9 @@ fn ftables_from_rows(rows: T) -> FTables { } fn ftables_from_csv_string(data: String) -> FTables { - let mut rdr = csv::Reader::from_string(data); + let mut rdr = csv::Reader::from_reader(data.as_bytes()); let mut ftables = HashMap::new(); - for frow in rdr.decode() { + for frow in rdr.deserialize() { let frow: FRow = frow.unwrap(); match ftables.entry(frow.field) { Entry::Vacant(v) => { diff --git a/tests/tests.rs b/tests/tests.rs index 26d32af6..2856d228 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -1,5 +1,10 @@ -#[macro_use] extern crate log; +#![allow(dead_code)] + +#[macro_use] +extern crate log; extern crate rustc_serialize; +#[macro_use] +extern crate serde_derive; extern crate csv; extern crate filetime; diff --git a/tests/workdir.rs b/tests/workdir.rs index 04533656..111ade33 100644 --- a/tests/workdir.rs +++ b/tests/workdir.rs @@ -35,6 +35,7 @@ impl Workdir { let dir = root.join(XSV_INTEGRATION_TEST_DIR) .join(name) .join(&format!("test-{}", id)); + // println!("{:?}", dir); if let Err(err) = create_dir_all(&dir) { panic!("Could not create '{:?}': {}", dir, err); } @@ -47,10 +48,12 @@ impl Workdir { } pub fn create(&self, name: &str, rows: T) { - let mut wtr = csv::Writer::from_file(&self.path(name)) 
- .unwrap().flexible(self.flexible); + let mut wtr = csv::WriterBuilder::new() + .flexible(self.flexible) + .from_path(&self.path(name)) + .unwrap(); for row in rows.to_vecs().into_iter() { - wtr.write(row.iter()).unwrap(); + wtr.write_record(row).unwrap(); } wtr.flush().unwrap(); } @@ -65,8 +68,18 @@ impl Workdir { pub fn read_stdout(&self, cmd: &mut process::Command) -> T { let stdout: String = self.stdout(cmd); - let mut rdr = csv::Reader::from_string(stdout).has_headers(false); - Csv::from_vecs(rdr.records().collect::>().unwrap()) + let mut rdr = csv::ReaderBuilder::new() + .has_headers(false) + .from_reader(io::Cursor::new(stdout)); + + let records: Vec> = rdr + .records() + .collect::, _>>() + .unwrap() + .into_iter() + .map(|r| r.iter().map(|f| f.to_string()).collect()) + .collect(); + Csv::from_vecs(records) } pub fn command(&self, sub_command: &str) -> process::Command { @@ -77,6 +90,7 @@ impl Workdir { pub fn output(&self, cmd: &mut process::Command) -> process::Output { debug!("[{}]: {:?}", self.dir.display(), cmd); + println!("[{}]: {:?}", self.dir.display(), cmd); let o = cmd.output().unwrap(); if !o.status.success() { panic!("\n\n===== {:?} =====\n\