Skip to content

Commit be78e67

Browse files
committed
Reimplement duplicate column handling while awake
I was tired and in a hurry when I implemented `--replace`, and I didn't like the code. Here's a new version with a more flexible `--duplicate-columns` flag and much cleaner code. This also adds an integration test suite which actually runs the tool from the command line and uses the SmartyStreets API.
1 parent aca0318 commit be78e67

File tree

6 files changed

+334
-64
lines changed

6 files changed

+334
-64
lines changed

src/addresses.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -183,15 +183,15 @@ impl<Key: Default + Eq> AddressColumnSpec<Key> {
183183
self.address_columns_by_prefix.get(prefix)
184184
}
185185

186-
/// What column indices should we remove from the input records in order
186+
/// What columns should we remove from the input records in order
187187
/// to prevent duplicate columns?
188188
///
189-
/// These indices will always be returned in order.
190-
pub fn column_indices_to_remove(
189+
/// Returns the name and index of each column to remove, in order.
190+
pub fn duplicate_columns<'header>(
191191
&self,
192192
structure: &Structure,
193-
header: &StringRecord,
194-
) -> Result<Vec<usize>> {
193+
header: &'header StringRecord,
194+
) -> Result<Vec<(&'header str, usize)>> {
195195
// Get all our column names for all prefixes, and insert them into a
196196
// hash table.
197197
let mut output_column_names = HashSet::new();
@@ -204,13 +204,13 @@ impl<Key: Default + Eq> AddressColumnSpec<Key> {
204204
}
205205

206206
// Decide which columns of `header` need to be removed.
207-
let mut indices_to_remove = vec![];
207+
let mut duplicate_columns = vec![];
208208
for (i, col) in header.iter().enumerate() {
209209
if output_column_names.contains(col) {
210-
indices_to_remove.push(i);
210+
duplicate_columns.push((col, i));
211211
}
212212
}
213-
Ok(indices_to_remove)
213+
Ok(duplicate_columns)
214214
}
215215
}
216216

@@ -235,8 +235,8 @@ fn find_columns_to_remove() {
235235
let structure = Structure::complete().unwrap();
236236
let header =
237237
StringRecord::from_iter(&["existing", "home_addressee", "work_addressee"]);
238-
let indices = spec.column_indices_to_remove(&structure, &header).unwrap();
239-
assert_eq!(indices, vec![1, 2]);
238+
let indices = spec.duplicate_columns(&structure, &header).unwrap();
239+
assert_eq!(indices, vec![("home_addressee", 1), ("work_addressee", 2)]);
240240
}
241241

242242
impl AddressColumnSpec<String> {

src/geocoder.rs

Lines changed: 73 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use log::{debug, error, trace, warn};
1010
use std::{
1111
cmp::max, io, iter::FromIterator, sync::Arc, thread::sleep, time::Duration,
1212
};
13+
use strum_macros::EnumString;
1314
use tokio::{
1415
prelude::*,
1516
sync::mpsc::{self, Receiver, Sender},
@@ -32,6 +33,19 @@ const CONCURRENCY: usize = 48;
3233
/// The number of addresses to pass to SmartyStreets at one time.
3334
const GEOCODE_SIZE: usize = 72;
3435

36+
/// What should we do if a geocoding output column has the same name as a column in
37+
/// the input?
38+
#[derive(Debug, Clone, Copy, EnumString, Eq, PartialEq)]
39+
#[strum(serialize_all = "snake_case")]
40+
pub enum OnDuplicateColumns {
41+
/// Fail with an error.
42+
Error,
43+
/// Replace existing columns with the same name.
44+
Replace,
45+
/// Leave the old columns in place and append the new ones.
46+
Append,
47+
}
48+
3549
/// Data about the CSV file that we include with every chunk to be geocoded.
3650
struct Shared {
3751
/// Which columns contain addresses that we need to geocode?
@@ -65,7 +79,7 @@ enum Message {
6579
pub async fn geocode_stdio(
6680
spec: AddressColumnSpec<String>,
6781
match_strategy: MatchStrategy,
68-
replace_existing_columns: bool,
82+
on_duplicate_columns: OnDuplicateColumns,
6983
structure: Structure,
7084
) -> Result<()> {
7185
// Set up bounded channels for communication between the sync and async
@@ -76,7 +90,7 @@ pub async fn geocode_stdio(
7690
// Hook up our inputs and outputs, which are synchronous functions running
7791
// in their own threads.
7892
let read_fut = run_sync_fn_in_background("read CSV".to_owned(), move || {
79-
read_csv_from_stdin(spec, structure, replace_existing_columns, in_tx)
93+
read_csv_from_stdin(spec, structure, on_duplicate_columns, in_tx)
8094
});
8195
let write_fut = run_sync_fn_in_background("write CSV".to_owned(), move || {
8296
write_csv_to_stdout(out_rx)
@@ -152,7 +166,7 @@ pub async fn geocode_stdio(
152166
fn read_csv_from_stdin(
153167
spec: AddressColumnSpec<String>,
154168
structure: Structure,
155-
replace_existing_columns: bool,
169+
on_duplicate_columns: OnDuplicateColumns,
156170
mut tx: Sender<Message>,
157171
) -> Result<()> {
158172
// Open up our CSV file and get the headers.
@@ -161,47 +175,56 @@ fn read_csv_from_stdin(
161175
let mut in_headers = rdr.headers()?.to_owned();
162176
debug!("input headers: {:?}", in_headers);
163177

164-
// Look for duplicate input columns, and decide what to do.
165-
let column_indices_to_remove =
166-
spec.column_indices_to_remove(&structure, &in_headers)?;
167-
let remove_column_flags = if column_indices_to_remove.is_empty() {
168-
// No columns to remove!
169-
None
170-
} else {
171-
// We have to remove columns, so make a human-readable list...
172-
let mut removed_names = vec![];
173-
for i in &column_indices_to_remove {
174-
removed_names.push(&in_headers[*i]);
175-
}
176-
let removed_names = removed_names.join(" ");
177-
178-
// And decide whether we're actually allowed to remove them or not.
179-
if replace_existing_columns {
180-
warn!("removing input columns: {}", removed_names);
178+
// Figure out if we have any duplicate columns.
179+
let (duplicate_column_indices, duplicate_column_names) = {
180+
let duplicate_columns = spec.duplicate_columns(&structure, &in_headers)?;
181+
let indices = duplicate_columns
182+
.iter()
183+
.map(|name_idx| name_idx.1)
184+
.collect::<Vec<_>>();
185+
let names = duplicate_columns
186+
.iter()
187+
.map(|name_idx| name_idx.0)
188+
.collect::<Vec<_>>()
189+
.join(", ");
190+
(indices, names)
191+
};
181192

182-
// Build the vector of bools specifying whether columns should
183-
// stay or go.
184-
let mut flags = vec![false; in_headers.len()];
185-
for i in column_indices_to_remove {
186-
flags[i] = true;
193+
// If we do have duplicate columns, figure out what to do about it.
194+
let mut should_remove_columns = false;
195+
let mut remove_column_flags = vec![false; in_headers.len()];
196+
if !duplicate_column_indices.is_empty() {
197+
match on_duplicate_columns {
198+
OnDuplicateColumns::Error => {
199+
return Err(format_err!(
200+
"input columns would conflict with geocoding columns: {}",
201+
duplicate_column_names,
202+
));
203+
}
204+
OnDuplicateColumns::Replace => {
205+
warn!("replacing input columns: {}", duplicate_column_names);
206+
should_remove_columns = true;
207+
for i in duplicate_column_indices.iter().cloned() {
208+
remove_column_flags[i] = true;
209+
}
210+
}
211+
OnDuplicateColumns::Append => {
212+
warn!(
213+
"output contains duplicate columns: {}",
214+
duplicate_column_names,
215+
);
187216
}
188-
Some(flags)
189-
} else {
190-
return Err(format_err!(
191-
"input columns would conflict with geocoding columns: {}",
192-
removed_names,
193-
));
194217
}
195-
};
218+
}
196219

197220
// Remove any duplicate columns from our input headers.
198-
if let Some(remove_column_flags) = &remove_column_flags {
221+
if should_remove_columns {
199222
in_headers = remove_columns(&in_headers, &remove_column_flags);
200223
}
201224

202225
// Convert our column spec from using header names to header indices.
203226
//
204-
// This needs to use "post-removal" indices!
227+
// This needs to happen _after_ `remove_columns` on our headers!
205228
let spec = spec.convert_to_indices_using_headers(&in_headers)?;
206229

207230
// Decide how big to make our chunks. We want to geocode no more
@@ -228,9 +251,9 @@ fn read_csv_from_stdin(
228251
let mut rows = Vec::with_capacity(chunk_size);
229252
for row in rdr.records() {
230253
let mut row = row?;
231-
if let Some(remove_column_flags) = &remove_column_flags {
254+
if should_remove_columns {
232255
// Strip out any duplicate columns.
233-
row = remove_columns(&row, remove_column_flags);
256+
row = remove_columns(&row, &remove_column_flags);
234257
}
235258
rows.push(row);
236259
if rows.len() >= chunk_size {
@@ -270,6 +293,20 @@ fn read_csv_from_stdin(
270293
Ok(())
271294
}
272295

296+
/// Remove columns from `row` if they're set to true in `remove_column_flags`.
297+
fn remove_columns(row: &StringRecord, remove_column_flags: &[bool]) -> StringRecord {
298+
debug_assert_eq!(row.len(), remove_column_flags.len());
299+
StringRecord::from_iter(row.iter().zip(remove_column_flags).filter_map(
300+
|(value, &remove)| {
301+
if remove {
302+
None
303+
} else {
304+
Some(value.to_owned())
305+
}
306+
},
307+
))
308+
}
309+
273310
/// Receive chunks of a CSV file from `rx` and write them to standard output.
274311
fn write_csv_to_stdout(mut rx: Receiver<Message>) -> Result<()> {
275312
let stdout = io::stdout();
@@ -410,15 +447,3 @@ async fn geocode_chunk(
410447
}
411448
Ok(chunk)
412449
}
413-
414-
fn remove_columns(row: &StringRecord, remove_column_flags: &[bool]) -> StringRecord {
415-
StringRecord::from_iter(row.iter().zip(remove_column_flags).filter_map(
416-
|(value, &remove)| {
417-
if remove {
418-
None
419-
} else {
420-
Some(value.to_owned())
421-
}
422-
},
423-
))
424-
}

src/main.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ mod structure;
1818
mod unpack_vec;
1919

2020
use addresses::AddressColumnSpec;
21-
use geocoder::geocode_stdio;
21+
use geocoder::{geocode_stdio, OnDuplicateColumns};
2222
use smartystreets::MatchStrategy;
2323
use structure::Structure;
2424

@@ -34,10 +34,10 @@ struct Opt {
3434
#[structopt(long = "match", default_value = "strict")]
3535
match_strategy: MatchStrategy,
3636

37-
/// Replace any existing columns with the same name as our geocoding
38-
/// columns.
39-
#[structopt(long = "replace")]
40-
replace: bool,
37+
/// What should we do if geocoding output columns have the same names as input
38+
/// columns? [error, replace, append]
39+
#[structopt(long = "duplicate-columns", default_value = "error")]
40+
on_duplicate_columns: OnDuplicateColumns,
4141

4242
/// A JSON file describing what columns to geocode.
4343
#[structopt(long = "spec")]
@@ -58,7 +58,12 @@ fn run() -> Result<()> {
5858
let structure = Structure::complete()?;
5959

6060
// Call our geocoder asynchronously.
61-
let geocode_fut = geocode_stdio(spec, opt.match_strategy, opt.replace, structure);
61+
let geocode_fut = geocode_stdio(
62+
spec,
63+
opt.match_strategy,
64+
opt.on_duplicate_columns,
65+
structure,
66+
);
6267

6368
// Pass our future to our async runtime.
6469
let mut runtime =

tests/about.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
//! Information about this CLI tool.
2+
3+
use cli_test_dir::*;
4+
5+
#[test]
6+
fn help_flag() {
7+
let testdir = TestDir::new("geocode-csv", "help_flag");
8+
let output = testdir.cmd().arg("--help").output().expect_success();
9+
assert!(output.stdout_str().contains("geocode-csv"));
10+
assert!(output.stdout_str().contains("--help"));
11+
}
12+
13+
#[test]
14+
fn version_flag() {
15+
let testdir = TestDir::new("geocode-csv", "version_flag");
16+
let output = testdir.cmd().arg("--version").output().expect_success();
17+
assert!(output.stdout_str().contains("geocode-csv"));
18+
assert!(output.stdout_str().contains(env!("CARGO_PKG_VERSION")));
19+
}

tests/duplicate_columns.rs

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
//! Specifying how to handle duplicate columns.
2+
3+
use cli_test_dir::*;
4+
5+
/// A CSV file to use for our tests.
6+
const SIMPLE_CSV: &str = r#"address,gc_addressee,zip
7+
20 W 34th St,,10118
8+
"#;
9+
10+
/// A spec file to use for our tests.
11+
const SIMPLE_SPEC: &str = r#"{
12+
"gc": {
13+
"house_number_and_street": "address",
14+
"postcode": "zip"
15+
}
16+
}"#;
17+
18+
#[test]
19+
#[ignore]
20+
fn duplicate_columns_error() {
21+
let testdir = TestDir::new("geocode-csv", "duplicate_columns_error");
22+
23+
testdir.create_file("spec.json", SIMPLE_SPEC);
24+
let output = testdir
25+
.cmd()
26+
.arg("--spec=spec.json")
27+
.arg("--duplicate-columns=error")
28+
.output_with_stdin(SIMPLE_CSV)
29+
.expect("could not run geocode-csv");
30+
31+
assert!(!output.status.success());
32+
}
33+
34+
#[test]
35+
#[ignore]
36+
fn duplicate_columns_replace() {
37+
let testdir = TestDir::new("geocode-csv", "duplicate_columns_replace");
38+
39+
testdir.create_file("spec.json", SIMPLE_SPEC);
40+
let output = testdir
41+
.cmd()
42+
.arg("--spec=spec.json")
43+
.arg("--duplicate-columns=replace")
44+
.output_with_stdin(SIMPLE_CSV)
45+
.expect_success();
46+
47+
assert!(output.stdout_str().contains("address,zip,gc_addressee,"));
48+
assert!(output.stdout_str().contains("Commercial"));
49+
}
50+
51+
#[test]
52+
#[ignore]
53+
fn duplicate_columns_append() {
54+
let testdir = TestDir::new("geocode-csv", "duplicate_columns_append");
55+
56+
testdir.create_file("spec.json", SIMPLE_SPEC);
57+
let output = testdir
58+
.cmd()
59+
.arg("--spec=spec.json")
60+
.arg("--duplicate-columns=append")
61+
.output_with_stdin(SIMPLE_CSV)
62+
.expect_success();
63+
64+
assert!(output
65+
.stdout_str()
66+
.contains("address,gc_addressee,zip,gc_addressee,"));
67+
assert!(output.stdout_str().contains("Commercial"));
68+
}

0 commit comments

Comments
 (0)