Skip to content

Commit 1438afc

Browse files
committed
Skip geocoding for records with empty addresses
Geocoding these addresses with Smarty now triggers a hard failure. (cherry picked from commit 2e9ae09)
1 parent dac8def commit 1438afc

File tree

6 files changed

+131
-9
lines changed

6 files changed

+131
-9
lines changed

src/addresses.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,12 @@ pub struct Address {
2727
}
2828

2929
impl Address {
30+
/// A valid `Address` has a non-empty `street` field. (And whitespace
31+
/// doesn't count as non-empty.)
32+
pub fn is_valid(&self) -> bool {
33+
!self.street.trim().is_empty()
34+
}
35+
3036
/// The `city` field, or an empty string.
3137
pub fn city_str(&self) -> &str {
3238
self.city.as_ref().map(|s| &s[..]).unwrap_or("")

src/geocoders/cache/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ impl Geocoder for Cache {
186186
// Encode our value for caching.
187187
let value = retry.as_ref().map(|retry| &retry.column_values);
188188
encoded.clear();
189-
bincode::encode_into_std_write(&value, &mut encoded, bincode_config)
189+
bincode::encode_into_std_write(value, &mut encoded, bincode_config)
190190
.context("could not encode value for caching")?;
191191

192192
// Compress our encoded value and add it to our pipeline set.
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
//! Skip invalid addresses and don't pass them through to the next layer.
2+
3+
use async_trait::async_trait;
4+
use metrics::{counter, describe_counter};
5+
6+
use crate::addresses::Address;
7+
8+
use super::{Geocoded, Geocoder, Result};
9+
10+
/// Skip invalid addresses and don't pass them through to the next layer.
11+
pub struct InvalidRecordSkipper {
12+
// Our inner geocoder that we're normalizing for.
13+
inner: Box<dyn Geocoder>,
14+
}
15+
16+
impl InvalidRecordSkipper {
17+
/// Create a new `Normalizer` wrapping the specified geocoder.
18+
pub fn new(inner: Box<dyn Geocoder>) -> InvalidRecordSkipper {
19+
describe_counter!(
20+
"geocodecsv.invalid_records.total",
21+
"Invalid records which could not be geocoded"
22+
);
23+
24+
InvalidRecordSkipper { inner }
25+
}
26+
}
27+
28+
#[async_trait]
29+
impl Geocoder for InvalidRecordSkipper {
30+
fn tag(&self) -> &str {
31+
// We don't change anything which could possibly affect caching,
32+
// so we can just use our inner tag.
33+
self.inner.tag()
34+
}
35+
36+
fn configuration_key(&self) -> &str {
37+
self.inner.configuration_key()
38+
}
39+
40+
fn column_names(&self) -> &[String] {
41+
self.inner.column_names()
42+
}
43+
44+
async fn geocode_addresses(
45+
&self,
46+
addresses: &[Address],
47+
) -> Result<Vec<Option<Geocoded>>> {
48+
// Extract our valid addresses, keeping track of their original
49+
// positions.
50+
let mut original_indices = vec![];
51+
let mut valid_addresses = vec![];
52+
for (i, address) in addresses.iter().enumerate() {
53+
if address.is_valid() {
54+
valid_addresses.push(address.clone());
55+
original_indices.push(i);
56+
} else {
57+
counter!("geocodecsv.invalid_records.total", 1);
58+
}
59+
}
60+
61+
// Geocode our valid addresses.
62+
let geocodeded = self.inner.geocode_addresses(&valid_addresses).await?;
63+
64+
// Rebuild our geocoded addresses, inserting `None` for invalid.
65+
let mut result = vec![None; addresses.len()];
66+
for (i, geocoded) in geocodeded.into_iter().enumerate() {
67+
let original_index = original_indices[i];
68+
result[original_index] = geocoded;
69+
}
70+
Ok(result)
71+
}
72+
}

src/geocoders/mod.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ use crate::{
1616
};
1717

1818
pub mod cache;
19+
pub mod invalid_record_skipper;
1920
pub mod libpostal;
2021
pub mod normalizer;
2122
pub mod smarty;
@@ -55,6 +56,9 @@ pub enum MatchStrategy {
5556
Enhanced,
5657
}
5758

59+
// `#derive(Default)` would actually do the right thing, but we want to make the
60+
// default explicit for the reader.
61+
#[allow(clippy::derivable_impls)]
5862
impl Default for MatchStrategy {
5963
fn default() -> Self {
6064
MatchStrategy::Strict
@@ -122,7 +126,7 @@ pub trait Geocoder: Send + Sync + 'static {
122126
let mut hasher = Sha256::new();
123127
for column_name in self.column_names() {
124128
hasher.update(column_name.as_bytes());
125-
hasher.update(&[0]);
129+
hasher.update([0]);
126130
}
127131
hasher.update(self.configuration_key());
128132
let hash = hasher.finalize();

src/main.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,6 @@
55

66
pub use anyhow::Result;
77
use anyhow::{format_err, Error};
8-
use geocoders::cache::Cache;
9-
use geocoders::libpostal::LibPostal;
10-
use geocoders::normalizer::Normalizer;
11-
use geocoders::smarty::Smarty;
12-
use geocoders::{shared_http_client, Geocoder};
13-
use key_value_stores::KeyValueStore;
148
use leaky_bucket::RateLimiter;
159
use metrics::describe_counter;
1610
use opinionated_metrics::Mode;
@@ -37,7 +31,12 @@ mod pipeline;
3731
mod unpack_vec;
3832

3933
use crate::addresses::AddressColumnSpec;
40-
use crate::geocoders::MatchStrategy;
34+
use crate::geocoders::{
35+
cache::Cache, invalid_record_skipper::InvalidRecordSkipper, libpostal::LibPostal,
36+
normalizer::Normalizer, shared_http_client, smarty::Smarty, Geocoder,
37+
MatchStrategy,
38+
};
39+
use crate::key_value_stores::KeyValueStore;
4140
use crate::pipeline::{geocode_stdio, OnDuplicateColumns, CONCURRENCY, GEOCODE_SIZE};
4241

4342
/// Underlying geocoders we can use. (Helper struct for argument parsing.)
@@ -237,6 +236,9 @@ async fn main() -> Result<()> {
237236
geocoder = Box::new(Normalizer::new(geocoder));
238237
}
239238

239+
// Always skip invalid records.
240+
geocoder = Box::new(InvalidRecordSkipper::new(geocoder));
241+
240242
// Call our geocoder.
241243
let result = geocode_stdio(
242244
spec,

tests/specs.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,3 +217,41 @@ fn rate_limiter() {
217217
.expect_success();
218218
assert!(output.stdout_str().contains("shipping_addressee"));
219219
}
220+
221+
#[test]
222+
#[ignore]
223+
fn skip_records_with_empty_house_number_and_street() {
224+
let testdir = TestDir::new(
225+
"geocode-csv",
226+
"skip_records_with_empty_house_number_and_street",
227+
);
228+
229+
testdir.create_file(
230+
"spec.json",
231+
r#"{
232+
"shipping": {
233+
"house_number_and_street": [
234+
"address_1",
235+
"address_2"
236+
],
237+
"postcode": "zip_code"
238+
}
239+
}"#,
240+
);
241+
242+
let output = testdir
243+
.cmd()
244+
.arg("--license=us-core-enterprise-cloud")
245+
.arg("--spec=spec.json")
246+
.output_with_stdin(
247+
r#"address_1,address_2,city,state,zip_code
248+
,,New York,NY,10118
249+
, ,Provo,UT,
250+
"#,
251+
)
252+
.expect_success();
253+
// We output all lines, without geocoding any that lack a street address.
254+
assert!(output.stdout_str().contains("shipping_addressee"));
255+
assert!(output.stdout_str().contains("New York"));
256+
assert!(output.stdout_str().contains("Provo"));
257+
}

0 commit comments

Comments
 (0)