Skip to content

Commit 8f9a656

Browse files
authored
Merge pull request #6 from faradayio/collapse_duplicate_suffix
Collapse duplicate suffixes (fixes #2)
2 parents e794032 + b092b6a commit 8f9a656

File tree

5 files changed

+37
-16
lines changed

5 files changed

+37
-16
lines changed

.travis.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@ script:
2020
# Require clippy to pass without warnings. This also fails for regular Rust
2121
# warnings.
2222
- cargo clippy -- -D warnings
23-
# Run our security, license and duplicate dependency version detector.
24-
- cargo deny check
2523
before_deploy:
2624
#- cargo doc
2725
- ./build-release geocode-csv "${TRAVIS_TAG}-${TRAVIS_OS_NAME}"

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@ FROM ekidd/rust-musl-builder:stable-openssl11
77
ADD . ./
88
RUN sudo chown -R rust:rust .
99

10-
CMD cargo build --release
10+
CMD cargo deny check && cargo build --release

src/addresses.rs

Lines changed: 33 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,16 +48,43 @@ impl ColumnKeyOrKeys<usize> {
4848
) -> Result<Cow<'a, str>> {
4949
match self {
5050
ColumnKeyOrKeys::Key(key) => Ok(Cow::Borrowed(&record[*key])),
51-
ColumnKeyOrKeys::Keys(keys) => Ok(Cow::Owned(
52-
keys.iter()
53-
.map(|key| &record[*key])
54-
.collect::<Vec<_>>()
55-
.join(" "),
56-
)),
51+
ColumnKeyOrKeys::Keys(keys) => {
52+
// Allocate an empty string with some reserved capacity so that we can
53+
// usually avoid reallocating each time we append.
54+
let mut extracted = String::with_capacity(40);
55+
for key in keys {
56+
let s = &record[*key];
57+
if extracted.is_empty() {
58+
extracted.push_str(s);
59+
} else if extracted.ends_with(s) {
60+
// Already there, so ignore it. This appears in a lot of
61+
// real-world databases, for some reason.
62+
} else {
63+
extracted.push_str(" ");
64+
extracted.push_str(s);
65+
}
66+
}
67+
Ok(Cow::Owned(extracted))
68+
}
5769
}
5870
}
5971
}
6072

73+
#[test]
74+
fn extract_collapses_duplicate_suffixes() {
75+
// This seems really arbitrary, but it consistently appears in many
76+
// real-world databases.
77+
//
78+
// I wonder if the equivalent "prefix" case is common?
79+
use std::iter::FromIterator;
80+
let record = StringRecord::from_iter(&["100", "Main Street #302", "#302"]);
81+
let keys = ColumnKeyOrKeys::Keys(vec![0, 1, 2]);
82+
assert_eq!(
83+
keys.extract_from_record(&record).unwrap(),
84+
"100 Main Street #302",
85+
);
86+
}
87+
6188
/// The column names from a CSV file that we want to use as addresses.
6289
///
6390
/// `K` is typically either a `String` (for a column name) or a `usize` (for a

src/async_util.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ where
2222
let thr = thread::Builder::new().name(thread_name);
2323
let handle = thr
2424
.spawn(move || {
25-
if let Err(_) = block_on(sender.send(f())) {
25+
if block_on(sender.send(f())).is_err() {
2626
panic!("should always be able to send results from background thread");
2727
}
2828
})

src/geocoder.rs

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ fn read_csv_from_stdin(
234234
let chunk_size = max(1, GEOCODE_SIZE / spec.prefix_count());
235235

236236
// Build our output headers.
237-
let mut out_headers = in_headers.clone();
237+
let mut out_headers = in_headers;
238238
for prefix in spec.prefixes() {
239239
structure.add_header_columns(prefix, &mut out_headers)?;
240240
}
@@ -275,11 +275,7 @@ fn read_csv_from_stdin(
275275
// rows that haven't been sent yet.
276276
if !sent_chunk || !rows.is_empty() {
277277
trace!("sending final {} input rows", rows.len());
278-
block_on(tx.send(Message::Chunk(Chunk {
279-
shared: shared.clone(),
280-
rows,
281-
})))
282-
.map_err(|_| {
278+
block_on(tx.send(Message::Chunk(Chunk { shared, rows }))).map_err(|_| {
283279
format_err!("could not send rows to geocoder (perhaps it failed)")
284280
})?;
285281
}

0 commit comments

Comments
 (0)