Skip to content

Commit ac4428a

Browse files
committed
Filter records with empty streets after normalization
This fixes an issue where normalization introduced empty streets.
1 parent 1e2150b commit ac4428a

File tree

4 files changed

+67
-7
lines changed

4 files changed

+67
-7
lines changed

src/addresses.rs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,20 @@ impl Address {
5757
}
5858
}
5959

60+
#[test]
61+
fn address_is_valid_does_not_allow_empty_streets() {
62+
let address_for = |street: &str| Address {
63+
street: street.to_owned(),
64+
city: None,
65+
state: None,
66+
zipcode: None,
67+
};
68+
assert!(!address_for("").is_valid());
69+
assert!(!address_for(" ").is_valid());
70+
assert!(!address_for(" \t\n ").is_valid());
71+
assert!(address_for("123 Main Street").is_valid());
72+
}
73+
6074
/// Either a column name, or a list of names.
6175
///
6276
/// `K` is typically either a `String` (for a column name) or a `usize` (for a

src/geocoders/smarty/client.rs

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use futures::stream::StreamExt;
88
use hyper::{Body, Request};
99
use metrics::{counter, describe_histogram, histogram, Unit};
1010
use serde::{Deserialize, Serialize};
11-
use tracing::instrument;
11+
use tracing::{error, instrument};
1212
use url::Url;
1313

1414
use crate::addresses::Address;
@@ -160,11 +160,49 @@ async fn street_addresses_impl(
160160
Ok(unpack_vec(resps, requests.len(), |resp| resp.input_index)?)
161161
} else {
162162
// This error was reported by the remote server.
163+
164+
// Add error to metrics.
163165
counter!("geocodecsv.selected_errors.count", 1, "component" => "smarty", "cause" => status.to_string());
166+
167+
// Log information about bad street fields, if we can.
168+
if status == 422 {
169+
if let Ok(error_response) =
170+
serde_json::from_slice::<SmartyErrorResponse>(&body_data)
171+
{
172+
let mut missing_street = false;
173+
for error in error_response.errors {
174+
if error.name == "us-street-api:query-missing-street" {
175+
missing_street = true;
176+
break;
177+
}
178+
}
179+
if missing_street {
180+
let streets = requests
181+
.iter()
182+
.map(|req| req.address.street.to_owned())
183+
.collect::<Vec<_>>();
184+
error!("At least one missing street in: {:?}", streets);
185+
}
186+
}
187+
}
188+
189+
// Convert to a Rust error.
164190
Err(format_err!(
165191
"geocoding error: {}\n{}",
166192
status,
167193
String::from_utf8_lossy(&body_data),
168194
))
169195
}
170196
}
197+
198+
/// Smarty error response body.
199+
#[derive(Debug, Deserialize)]
200+
struct SmartyErrorResponse {
201+
errors: Vec<SmartyError>,
202+
}
203+
204+
/// Smarty error.
205+
#[derive(Debug, Deserialize)]
206+
struct SmartyError {
207+
name: String,
208+
}

src/geocoders/smarty/mod.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,14 @@ impl Geocoder for Smarty {
8484
&self,
8585
addresses: &[Address],
8686
) -> Result<Vec<Option<Geocoded>>> {
87+
// An array used to store our geocoding results.
88+
let mut geocoded = vec![None; addresses.len()];
89+
90+
// If we have nothing to do, exit immediately.
91+
if addresses.is_empty() {
92+
return Ok(geocoded);
93+
}
94+
8795
// If we have a rate limiter, ask for permission to geocode the
8896
// specified number of addresses. We only check if we have one, in order
8997
// minimize thread synchronization costs.
@@ -111,9 +119,8 @@ impl Geocoder for Smarty {
111119
counter!("geocodecsv.addresses_geocoded.total", hits as u64, "geocoder" => "smarty", "geocode_result" => "found");
112120
counter!("geocodecsv.addresses_geocoded.total", (addresses.len() - hits) as u64, "geocoder" => "smarty", "geocode_result" => "unknown_address");
113121

114-
// Build an array containing only the addresses that actually geocoded
115-
// successfully.
116-
let mut geocoded = vec![None; addresses.len()];
122+
// Response with only the addresses that actually geocoded successfully.
123+
// We need make sure we map outputs back to their original positions.
117124
for address_output in response.into_iter().flatten() {
118125
let column_values =
119126
self.structure.value_columns_for(&address_output.fields)?;

src/main.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -237,14 +237,15 @@ async fn main() -> Result<()> {
237237
);
238238
}
239239

240+
// Always skip invalid records. This needs to happen after we do
241+
// normalization, because normalization might move data between fields.
242+
geocoder = Box::new(InvalidRecordSkipper::new(geocoder));
243+
240244
// If we were asked, normalize addresses a bit first.
241245
if opt.normalize {
242246
geocoder = Box::new(Normalizer::new(geocoder));
243247
}
244248

245-
// Always skip invalid records.
246-
geocoder = Box::new(InvalidRecordSkipper::new(geocoder));
247-
248249
// Call our geocoder.
249250
let result = geocode_stdio(
250251
spec,

0 commit comments

Comments
 (0)