static USAGE: &str = r#"
Convert CSV files to XLSX/POSTGRES/SQLITE/PARQUET/DATAPACKAGE.

POSTGRES
To convert to postgres you need to supply a connection string.
The format is described here - https://docs.rs/postgres/latest/postgres/config/struct.Config.html#examples-1.
Additionally, you can use `env=MY_ENV_VAR` and qsv will get the connection string from the
environment variable `MY_ENV_VAR`.

Examples:

Load `file1.csv` and `file2.csv` into the local database `test`, with user `testuser` and password `pass`.

  $ qsv to postgres 'postgres://testuser:pass@localhost/test' file1.csv file2.csv

Load the same files into a new/existing postgres schema `myschema`.

  $ qsv to postgres 'postgres://testuser:pass@localhost/test' --schema=myschema file1.csv file2.csv

Load the same files into a new/existing postgres database whose connection string is in the
`DATABASE_URL` environment variable.

  $ qsv to postgres 'env=DATABASE_URL' file1.csv file2.csv

Drop tables if they exist before loading.

  $ qsv to postgres 'postgres://testuser:pass@localhost/test' --drop file1.csv file2.csv

Evolve tables if they exist before loading. See http://datapackage_convert.opendata.coop/evolve.html
for an explanation of how evolving works.

  $ qsv to postgres 'postgres://testuser:pass@localhost/test' --evolve file1.csv file2.csv

SQLITE
Convert to a sqlite db file. It will be created if it does not exist.

Examples:

Load `file1.csv` and `file2.csv` into the sqlite database `test.db`.

  $ qsv to sqlite test.db file1.csv file2.csv

Drop tables if they exist before loading.

  $ qsv to sqlite test.db --drop file1.csv file2.csv

Evolve tables if they exist. See http://datapackage_convert.opendata.coop/evolve.html
for an explanation of how evolving works.

  $ qsv to sqlite test.db --evolve file1.csv file2.csv

XLSX
Convert to a new xlsx file.

Examples:

Load `file1.csv` and `file2.csv` into an xlsx file.

  $ qsv to xlsx output.xlsx file1.csv file2.csv

PARQUET
Convert to a directory of parquet files. You need to select a directory; it will be
created if it does not exist.

Examples:

Convert `file1.csv` and `file2.csv` into `mydir/file1.parquet` and `mydir/file2.parquet`.

  $ qsv to parquet mydir file1.csv file2.csv

DATAPACKAGE
Generate a datapackage, which contains stats and information about what is in the CSV files.

Examples:

Generate a `datapackage.json` file from `file1.csv` and `file2.csv`.

  $ qsv to datapackage datapackage.json file1.csv file2.csv

Add more stats to the datapackage.

  $ qsv to datapackage datapackage.json --stats file1.csv file2.csv

For all other conversions, you can output the datapackage created by specifying `--print-package`.

  $ qsv to xlsx datapackage.xlsx --stats --print-package file1.csv file2.csv

Usage:
    qsv to postgres [options] <connection> [<input>...]
    qsv to sqlite [options] <sqlite> [<input>...]
    qsv to xlsx [options] <xlsx> [<input>...]
    qsv to parquet [options] <parquet> [<input>...]
    qsv to datapackage [options] <datapackage> [<input>...]
    qsv to --help

To options:
    -k --print-package     Print statistics as a datapackage; by default a field summary is printed.
    -a --stats             Produce extra statistics about the data beyond just type guessing.
    -c --stats-csv <path>  Output stats as CSV to the specified file.
    -q --quiet             Do not print the field summary.
    -t --threads <num>     Use this number of threads when calculating stats/type guessing.
    -s --schema <arg>      The schema to load the data into. (postgres only)
    -d --drop              Drop tables before loading new data into them. (postgres/sqlite only)
    -e --evolve            If loading into an existing db, alter existing tables so that new data
                           will load. (postgres/sqlite only)
    -p --separator <arg>   For xlsx, use this character to help truncate xlsx sheet names.
                           Defaults to space.

Common options:
    -h, --help             Display this message
    -d, --delimiter <arg>  The field delimiter for reading CSV data.
                           Must be a single character. (default: ,)
"#;
use std::{io::Write, path::PathBuf};

use csvs_convert::{
    csvs_to_parquet_with_options, csvs_to_postgres_with_options, csvs_to_sqlite_with_options,
    csvs_to_xlsx_with_options, make_datapackage, DescribeOptions, Options,
};
use log::debug;
use serde::Deserialize;

use crate::{
    config::{self, Delimiter},
    util, CliResult,
};
#[allow(dead_code)]
#[derive(Deserialize)]
struct Args {
    cmd_postgres: bool,
    arg_connection: Option<String>,
    cmd_sqlite: bool,
    arg_sqlite: Option<String>,
    cmd_parquet: bool,
    arg_parquet: Option<String>,
    cmd_xlsx: bool,
    arg_xlsx: Option<String>,
    cmd_datapackage: bool,
    arg_datapackage: Option<String>,
    arg_input: Vec<PathBuf>,
    flag_delimiter: Option<Delimiter>,
    flag_schema: Option<String>,
    flag_separator: Option<String>,
    flag_drop: bool,
    flag_evolve: bool,
    flag_stats: bool,
    flag_stats_csv: Option<String>,
    flag_threads: Option<usize>,
    flag_print_package: bool,
    flag_quiet: bool,
}
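
/// Run the `to` command: parse the CLI args, build the csvs_convert `Options`,
/// dispatch to the requested target (postgres/sqlite/parquet/xlsx/datapackage),
/// then print either the datapackage JSON or a per-table field summary.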
pub fn run(argv: &[&str]) -> CliResult<()> {
    let args: Args = util::get_args(USAGE, argv)?;
    debug!("'to' command running");

    let options = Options::builder()
        .delimiter(args.flag_delimiter.map(config::Delimiter::as_byte))
        .schema(args.flag_schema.unwrap_or_default())
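        // note: `seperator` (sic) is the setter name as spelled by the csvs_convert builder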
        .seperator(args.flag_separator.unwrap_or_else(|| " ".into()))
        .evolve(args.flag_evolve)
        .stats(args.flag_stats)
        .stats_csv(args.flag_stats_csv.unwrap_or_default())
        .drop(args.flag_drop)
        .threads(args.flag_threads.unwrap_or(0))
        .build();
    let output;
    if args.cmd_postgres {
        debug!("converting to postgres");
        if args.arg_input.is_empty() {
            return fail_clierror!(
                "Need to add a connection string as the first argument, then the input CSVs"
            );
        }
        output = csvs_to_postgres_with_options(
            args.arg_connection.expect("checked above"),
            args.arg_input,
            options,
        )?;
        debug!("conversion to postgres complete");
    } else if args.cmd_sqlite {
        debug!("converting to sqlite");
        if args.arg_input.is_empty() {
            return fail_clierror!(
                "Need to add the name of a sqlite db as the first argument, then the input CSVs"
            );
        }
        output = csvs_to_sqlite_with_options(
            args.arg_sqlite.expect("checked above"),
            args.arg_input,
            options,
        )?;
        debug!("conversion to sqlite complete");
    } else if args.cmd_parquet {
        debug!("converting to parquet");
        if args.arg_input.is_empty() {
            return fail_clierror!(
                "Need to add the directory for the parquet files as the first argument, then \
                 the input CSVs"
            );
        }
        output = csvs_to_parquet_with_options(
            args.arg_parquet.expect("checked above"),
            args.arg_input,
            options,
        )?;
        debug!("conversion to parquet complete");
    } else if args.cmd_xlsx {
        debug!("converting to xlsx");
        if args.arg_input.is_empty() {
            return fail_clierror!(
                "Need to add the name of an xlsx file as the first argument, then the input CSVs"
            );
        }
        output = csvs_to_xlsx_with_options(
            args.arg_xlsx.expect("checked above"),
            args.arg_input,
            options,
        )?;
        debug!("conversion to xlsx complete");
    } else if args.cmd_datapackage {
        debug!("creating datapackage");
        if args.arg_input.is_empty() {
            return fail_clierror!(
                "Need to add the name of a datapackage file as the first argument, then the \
                 input CSVs"
            );
        }
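        // Datapackage generation only describes the CSVs; carry over the
        // relevant describe/stats settings from the options built above.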
        let describe_options = DescribeOptions::builder()
            .delimiter(options.delimiter)
            .stats(options.stats)
            .threads(options.threads)
            .stats_csv(options.stats_csv);
        output = make_datapackage(args.arg_input, PathBuf::new(), &describe_options.build())?;
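        // Persist the generated datapackage as pretty-printed JSON at the
        // path given on the command line.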
        let file = std::fs::File::create(args.arg_datapackage.expect("checked above"))?;
        serde_json::to_writer_pretty(file, &output)?;
        debug!("datapackage complete");
    } else {
        return fail_clierror!(
            "Need to supply one of postgres, sqlite, xlsx, parquet or datapackage as the subcommand"
        );
    }
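
    // Report the results: either the full datapackage JSON (--print-package),
    // or a human-readable per-table field summary (unless --quiet).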
    if args.flag_print_package {
        println!(
            "{}",
            serde_json::to_string_pretty(&output).expect("values should be serializable")
        );
    } else if !args.flag_quiet {
        let empty_array = vec![];
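        // Each resource in the datapackage corresponds to one converted CSV table.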
        for resource in output["resources"].as_array().unwrap_or(&empty_array) {
            let mut stdout = std::io::stdout();
            writeln!(&mut stdout)?;
            writeln!(
                &mut stdout,
                "Table '{}' ({} rows)",
                resource["name"].as_str().unwrap_or(""),
                resource["row_count"].as_i64().unwrap_or(0)
            )?;
            writeln!(&mut stdout)?;
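            // Use tabwriter to elastic-align the tab-separated summary columns.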
            let mut tabwriter = tabwriter::TabWriter::new(stdout);
            writeln!(
                &mut tabwriter,
                "{}",
                ["Field Name", "Field Type", "Field Format"].join("\t")
            )?;
            for field in resource["schema"]["fields"]
                .as_array()
                .unwrap_or(&empty_array)
            {
                writeln!(
                    &mut tabwriter,
                    "{}",
                    [
                        field["name"].as_str().unwrap_or(""),
                        field["type"].as_str().unwrap_or(""),
                        field["format"].as_str().unwrap_or("")
                    ]
                    .join("\t")
                )?;
            }
            tabwriter.flush()?;
        }
        let mut stdout = std::io::stdout();
        writeln!(&mut stdout)?;
    }
    Ok(())
}