forked from dathere/qsv
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscramble.rs
77 lines (67 loc) · 2.31 KB
/
scramble.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
use std::io;
use rand::seq::SliceRandom;
use crate::config::{Config, Delimiter};
use crate::index::Indexed;
use crate::util;
use crate::CliResult;
use serde::Deserialize;
/// Help/usage text for the `scramble` command. `run` hands this string to
/// `util::get_args` together with the raw argv to produce an `Args` value, so
/// the option names listed here correspond to the fields of `Args`.
static USAGE: &str = "
Randomly scrambles CSV records uniformly using memory proportional to the size of
the CSV.
Usage:
qsv scramble [options] [<input>]
qsv scramble --help
Common options:
-h, --help Display this message
-o, --output <file> Write output to <file> instead of stdout.
-n, --no-headers When set, the first row will be considered as part of
the data to scramble. (When not set, the
first row is the header row and will always appear
in the output.)
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
";
/// Parsed command-line arguments for `scramble`; `run` obtains an instance
/// from `util::get_args(USAGE, argv)`.
#[derive(Deserialize)]
struct Args {
/// Optional `<input>` argument; given to `Config::new` to build the reader config.
arg_input: Option<String>,
/// Optional `--output` value; given to `Config::new` to build the writer.
flag_output: Option<String>,
/// `--no-headers` switch; forwarded to `Config::no_headers` on the reader config.
flag_no_headers: bool,
/// Optional `--delimiter` value; forwarded to `Config::delimiter` on the reader config.
flag_delimiter: Option<Delimiter>,
}
/// Entry point for the `scramble` command.
///
/// Parses `argv` against `USAGE`, loads the input through its index, writes
/// the header row (via `Config::write_headers`) and then writes every record
/// in the shuffled order produced by `scramble_random_access`.
///
/// # Errors
///
/// Returns an error when argument parsing fails, when the input has no index
/// ("Scrambling requires an index."), or when any read, write or flush fails.
pub fn run(argv: &[&str]) -> CliResult<()> {
    let args: Args = util::get_args(USAGE, argv)?;
    let rconfig = Config::new(&args.arg_input)
        .delimiter(args.flag_delimiter)
        .no_headers(args.flag_no_headers);

    // Check for the index *before* opening the output, so that a run which
    // cannot proceed does not touch the output destination first.
    // NOTE(review): presumably `writer()` creates/truncates the output file —
    // confirm against `Config::writer`.
    let mut idx = match rconfig.indexed()? {
        Some(idx) => idx,
        None => return fail!("Scrambling requires an index."),
    };

    let mut wtr = Config::new(&args.flag_output).writer()?;
    rconfig.write_headers(&mut *idx, &mut wtr)?;

    // All records are read into memory before the first one is written.
    for row in scramble_random_access(&mut idx)? {
        wtr.write_byte_record(&row)?;
    }
    Ok(wtr.flush()?)
}
fn scramble_random_access<R, I>(idx: &mut Indexed<R, I>) -> CliResult<Vec<csv::ByteRecord>>
where
R: io::Read + io::Seek,
I: io::Read + io::Seek,
{
let idxcount = idx.count();
let mut all_indices = (0..idxcount).collect::<Vec<_>>();
let mut rng = ::rand::thread_rng();
SliceRandom::shuffle(&mut *all_indices, &mut rng);
let mut scrambled = Vec::with_capacity(idxcount as usize);
for i in all_indices.into_iter().take(idxcount as usize) {
idx.seek(i)?;
scrambled.push(idx.byte_records().next().unwrap()?);
}
Ok(scrambled)
}