Skip to content

Commit 660c81c

Browse files
committed
ARROW-11414: [Rust] Reduce copies in Schema::try_merge
I was looking at this code yesterday while using it in IOx -- influxdata/influxdb_iox#703 ## Rationale: Even though `Schema::try_merge` requires a slice of `Schema`s (not schema refs) ownership of its inputs, it copies all of its fields. This is inefficient ideal in the common case where most of the fields in the merged `Schema` will be the same ## Changes: This PR proposes to change the implementation so that `try_merge` takes something (like a `Vec`) that can iterate over the Schemas passed in and consume them, avoiding at least one copy per unique field. I intend no algorithmic changes, only performance improvement. Closes #9347 from alamb/alamb/less-copy-in-merge Authored-by: Andrew Lamb <andrew@nerdnetworks.org> Signed-off-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 0091260 commit 660c81c

File tree

2 files changed

+34
-37
lines changed

2 files changed

+34
-37
lines changed

rust/arrow/src/csv/reader.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ pub fn infer_schema_from_files(
222222
}
223223
}
224224

225-
Schema::try_merge(&schemas)
225+
Schema::try_merge(schemas)
226226
}
227227

228228
// optional bounds of the reader, of the form (min line, max line).

rust/arrow/src/datatypes.rs

Lines changed: 33 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1823,7 +1823,7 @@ impl Schema {
18231823
/// ```
18241824
/// use arrow::datatypes::*;
18251825
///
1826-
/// let merged = Schema::try_merge(&vec![
1826+
/// let merged = Schema::try_merge(vec![
18271827
/// Schema::new(vec![
18281828
/// Field::new("c1", DataType::Int64, false),
18291829
/// Field::new("c2", DataType::Utf8, false),
@@ -1844,44 +1844,40 @@ impl Schema {
18441844
/// ]),
18451845
/// );
18461846
/// ```
1847-
pub fn try_merge(schemas: &[Self]) -> Result<Self> {
1848-
let mut merged = Self::empty();
1849-
1850-
for schema in schemas {
1851-
for (key, value) in schema.metadata.iter() {
1852-
// merge metadata
1853-
match merged.metadata.get(key) {
1854-
Some(old_val) => {
1855-
if old_val != value {
1847+
pub fn try_merge(schemas: impl IntoIterator<Item = Self>) -> Result<Self> {
1848+
schemas
1849+
.into_iter()
1850+
.try_fold(Self::empty(), |mut merged, schema| {
1851+
let Schema { metadata, fields } = schema;
1852+
for (key, value) in metadata.into_iter() {
1853+
// merge metadata
1854+
if let Some(old_val) = merged.metadata.get(&key) {
1855+
if old_val != &value {
18561856
return Err(ArrowError::SchemaError(
1857-
"Fail to merge schema due to conflicting metadata"
1857+
"Fail to merge schema due to conflicting metadata."
18581858
.to_string(),
18591859
));
18601860
}
18611861
}
1862-
None => {
1863-
merged.metadata.insert(key.clone(), value.clone());
1864-
}
1862+
merged.metadata.insert(key, value);
18651863
}
1866-
}
1867-
// merge fields
1868-
for field in &schema.fields {
1869-
let mut new_field = true;
1870-
for merged_field in &mut merged.fields {
1871-
if field.name != merged_field.name {
1872-
continue;
1864+
// merge fields
1865+
for field in fields.into_iter() {
1866+
let mut new_field = true;
1867+
for merged_field in &mut merged.fields {
1868+
if field.name != merged_field.name {
1869+
continue;
1870+
}
1871+
new_field = false;
1872+
merged_field.try_merge(&field)?
1873+
}
1874+
// found a new field, add to field list
1875+
if new_field {
1876+
merged.fields.push(field);
18731877
}
1874-
new_field = false;
1875-
merged_field.try_merge(field)?
1876-
}
1877-
// found a new field, add to field list
1878-
if new_field {
1879-
merged.fields.push(field.clone());
18801878
}
1881-
}
1882-
}
1883-
1884-
Ok(merged)
1879+
Ok(merged)
1880+
})
18851881
}
18861882

18871883
/// Returns an immutable reference of the vector of `Field` instances.
@@ -3031,7 +3027,8 @@ mod tests {
30313027
f2.set_metadata(Some(metadata2));
30323028

30333029
assert!(
3034-
Schema::try_merge(&[Schema::new(vec![f1]), Schema::new(vec![f2])]).is_err()
3030+
Schema::try_merge(vec![Schema::new(vec![f1]), Schema::new(vec![f2])])
3031+
.is_err()
30353032
);
30363033

30373034
// 2. None + Some
@@ -3105,7 +3102,7 @@ mod tests {
31053102

31063103
#[test]
31073104
fn test_schema_merge() -> Result<()> {
3108-
let merged = Schema::try_merge(&[
3105+
let merged = Schema::try_merge(vec![
31093106
Schema::new(vec![
31103107
Field::new("first_name", DataType::Utf8, false),
31113108
Field::new("last_name", DataType::Utf8, false),
@@ -3164,7 +3161,7 @@ mod tests {
31643161

31653162
// support merge union fields
31663163
assert_eq!(
3167-
Schema::try_merge(&[
3164+
Schema::try_merge(vec![
31683165
Schema::new(vec![Field::new(
31693166
"c1",
31703167
DataType::Union(vec![
@@ -3194,7 +3191,7 @@ mod tests {
31943191
);
31953192

31963193
// incompatible field should throw error
3197-
assert!(Schema::try_merge(&[
3194+
assert!(Schema::try_merge(vec![
31983195
Schema::new(vec![
31993196
Field::new("first_name", DataType::Utf8, false),
32003197
Field::new("last_name", DataType::Utf8, false),
@@ -3204,7 +3201,7 @@ mod tests {
32043201
.is_err());
32053202

32063203
// incompatible metadata should throw error
3207-
assert!(Schema::try_merge(&[
3204+
assert!(Schema::try_merge(vec![
32083205
Schema::new_with_metadata(
32093206
vec![Field::new("first_name", DataType::Utf8, false)],
32103207
[("foo".to_string(), "bar".to_string()),]

0 commit comments

Comments
 (0)