
Commit cc0caa6

rxin authored and yhuai committed
[SPARK-17270][SQL] Move object optimization rules into its own file
## What changes were proposed in this pull request?

As part of breaking Optimizer.scala apart, this patch moves various Dataset object optimization rules into a single file. I'm submitting separate pull requests so we can more easily merge this in branch-2.0 to simplify optimizer backports.

## How was this patch tested?

This should be covered by existing tests.

Author: Reynold Xin <rxin@databricks.com>

Closes apache#14839 from rxin/SPARK-17270.
1 parent a6bca3a commit cc0caa6

2 files changed: +98 -71 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 0 additions & 71 deletions
@@ -201,43 +201,6 @@ object RemoveAliasOnlyProject extends Rule[LogicalPlan] {
   }
 }

-/**
- * Removes cases where we are unnecessarily going between the object and serialized (InternalRow)
- * representation of data item. For example back to back map operations.
- */
-object EliminateSerialization extends Rule[LogicalPlan] {
-  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-    case d @ DeserializeToObject(_, _, s: SerializeFromObject)
-        if d.outputObjAttr.dataType == s.inputObjAttr.dataType =>
-      // Adds an extra Project here, to preserve the output expr id of `DeserializeToObject`.
-      // We will remove it later in RemoveAliasOnlyProject rule.
-      val objAttr = Alias(s.inputObjAttr, s.inputObjAttr.name)(exprId = d.outputObjAttr.exprId)
-      Project(objAttr :: Nil, s.child)
-
-    case a @ AppendColumns(_, _, _, _, _, s: SerializeFromObject)
-        if a.deserializer.dataType == s.inputObjAttr.dataType =>
-      AppendColumnsWithObject(a.func, s.serializer, a.serializer, s.child)
-
-    // If there is a `SerializeFromObject` under typed filter and its input object type is same with
-    // the typed filter's deserializer, we can convert typed filter to normal filter without
-    // deserialization in condition, and push it down through `SerializeFromObject`.
-    // e.g. `ds.map(...).filter(...)` can be optimized by this rule to save extra deserialization,
-    // but `ds.map(...).as[AnotherType].filter(...)` can not be optimized.
-    case f @ TypedFilter(_, _, _, _, s: SerializeFromObject)
-        if f.deserializer.dataType == s.inputObjAttr.dataType =>
-      s.copy(child = f.withObjectProducerChild(s.child))
-
-    // If there is a `DeserializeToObject` upon typed filter and its output object type is same with
-    // the typed filter's deserializer, we can convert typed filter to normal filter without
-    // deserialization in condition, and pull it up through `DeserializeToObject`.
-    // e.g. `ds.filter(...).map(...)` can be optimized by this rule to save extra deserialization,
-    // but `ds.filter(...).as[AnotherType].map(...)` can not be optimized.
-    case d @ DeserializeToObject(_, _, f: TypedFilter)
-        if d.outputObjAttr.dataType == f.deserializer.dataType =>
-      f.withObjectProducerChild(d.copy(child = f.child))
-  }
-}
-
 /**
  * Pushes down [[LocalLimit]] beneath UNION ALL and beneath the streamed inputs of outer joins.
  */
@@ -1713,40 +1676,6 @@ case class GetCurrentDatabase(sessionCatalog: SessionCatalog) extends Rule[LogicalPlan] {
   }
 }

-/**
- * Combines two adjacent [[TypedFilter]]s, which operate on same type object in condition, into one,
- * mering the filter functions into one conjunctive function.
- */
-object CombineTypedFilters extends Rule[LogicalPlan] {
-  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
-    case t1 @ TypedFilter(_, _, _, _, t2 @ TypedFilter(_, _, _, _, child))
-        if t1.deserializer.dataType == t2.deserializer.dataType =>
-      TypedFilter(
-        combineFilterFunction(t2.func, t1.func),
-        t1.argumentClass,
-        t1.argumentSchema,
-        t1.deserializer,
-        child)
-  }
-
-  private def combineFilterFunction(func1: AnyRef, func2: AnyRef): Any => Boolean = {
-    (func1, func2) match {
-      case (f1: FilterFunction[_], f2: FilterFunction[_]) =>
-        input => f1.asInstanceOf[FilterFunction[Any]].call(input) &&
-          f2.asInstanceOf[FilterFunction[Any]].call(input)
-      case (f1: FilterFunction[_], f2) =>
-        input => f1.asInstanceOf[FilterFunction[Any]].call(input) &&
-          f2.asInstanceOf[Any => Boolean](input)
-      case (f1, f2: FilterFunction[_]) =>
-        input => f1.asInstanceOf[Any => Boolean].apply(input) &&
-          f2.asInstanceOf[FilterFunction[Any]].call(input)
-      case (f1, f2) =>
-        input => f1.asInstanceOf[Any => Boolean].apply(input) &&
-          f2.asInstanceOf[Any => Boolean].apply(input)
-    }
-  }
-}
-
 /**
  * This rule rewrites predicate sub-queries into left semi/anti joins. The following predicates
  * are supported:
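
For reference, here is a minimal, hypothetical sketch of the Dataset chains that EliminateSerialization targets: the back-to-back maps and the map-then-filter case described in the comments of the removed block above. The SparkSession setup, object name, and data are illustrative only and are not part of this patch.

```scala
import org.apache.spark.sql.SparkSession

object EliminateSerializationExample {
  def main(args: Array[String]): Unit = {
    // Illustrative local session; any SparkSession would do.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("EliminateSerializationExample")
      .getOrCreate()
    import spark.implicits._

    // A typed Dataset; the concrete data is arbitrary.
    val ds = spark.range(100).as[Long]

    // Back-to-back typed maps: each map alone would serialize its result to
    // InternalRow and the next map would deserialize it again; the rule
    // collapses that round trip.
    val mapped = ds.map(_ + 1).map(_ * 2)

    // A typed filter directly on a typed map: the filter can be rewritten as a
    // normal filter and pushed below the map's SerializeFromObject.
    val filtered = ds.map(_ + 1).filter(_ % 2 == 0)

    // Inspect the optimized logical plans (exact plan shape varies by version).
    mapped.explain(true)
    filtered.explain(true)

    spark.stop()
  }
}
```

Running explain(true) on such queries is one way to check that the extra serialize/deserialize pairs no longer appear in the optimized plan.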
Lines changed: 98 additions & 0 deletions
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.api.java.function.FilterFunction
+import org.apache.spark.sql.catalyst.expressions._
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.rules._
+
+/*
+ * This file defines optimization rules related to object manipulation (for the Dataset API).
+ */
+
+/**
+ * Removes cases where we are unnecessarily going between the object and serialized (InternalRow)
+ * representation of data item. For example back to back map operations.
+ */
+object EliminateSerialization extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case d @ DeserializeToObject(_, _, s: SerializeFromObject)
+        if d.outputObjAttr.dataType == s.inputObjAttr.dataType =>
+      // Adds an extra Project here, to preserve the output expr id of `DeserializeToObject`.
+      // We will remove it later in RemoveAliasOnlyProject rule.
+      val objAttr = Alias(s.inputObjAttr, s.inputObjAttr.name)(exprId = d.outputObjAttr.exprId)
+      Project(objAttr :: Nil, s.child)
+
+    case a @ AppendColumns(_, _, _, _, _, s: SerializeFromObject)
+        if a.deserializer.dataType == s.inputObjAttr.dataType =>
+      AppendColumnsWithObject(a.func, s.serializer, a.serializer, s.child)
+
+    // If there is a `SerializeFromObject` under typed filter and its input object type is same with
+    // the typed filter's deserializer, we can convert typed filter to normal filter without
+    // deserialization in condition, and push it down through `SerializeFromObject`.
+    // e.g. `ds.map(...).filter(...)` can be optimized by this rule to save extra deserialization,
+    // but `ds.map(...).as[AnotherType].filter(...)` can not be optimized.
+    case f @ TypedFilter(_, _, _, _, s: SerializeFromObject)
+        if f.deserializer.dataType == s.inputObjAttr.dataType =>
+      s.copy(child = f.withObjectProducerChild(s.child))
+
+    // If there is a `DeserializeToObject` upon typed filter and its output object type is same with
+    // the typed filter's deserializer, we can convert typed filter to normal filter without
+    // deserialization in condition, and pull it up through `DeserializeToObject`.
+    // e.g. `ds.filter(...).map(...)` can be optimized by this rule to save extra deserialization,
+    // but `ds.filter(...).as[AnotherType].map(...)` can not be optimized.
+    case d @ DeserializeToObject(_, _, f: TypedFilter)
+        if d.outputObjAttr.dataType == f.deserializer.dataType =>
+      f.withObjectProducerChild(d.copy(child = f.child))
+  }
+}
+
+/**
+ * Combines two adjacent [[TypedFilter]]s, which operate on same type object in condition, into one,
+ * mering the filter functions into one conjunctive function.
+ */
+object CombineTypedFilters extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case t1 @ TypedFilter(_, _, _, _, t2 @ TypedFilter(_, _, _, _, child))
+        if t1.deserializer.dataType == t2.deserializer.dataType =>
+      TypedFilter(
+        combineFilterFunction(t2.func, t1.func),
+        t1.argumentClass,
+        t1.argumentSchema,
+        t1.deserializer,
+        child)
+  }
+
+  private def combineFilterFunction(func1: AnyRef, func2: AnyRef): Any => Boolean = {
+    (func1, func2) match {
+      case (f1: FilterFunction[_], f2: FilterFunction[_]) =>
+        input => f1.asInstanceOf[FilterFunction[Any]].call(input) &&
+          f2.asInstanceOf[FilterFunction[Any]].call(input)
+      case (f1: FilterFunction[_], f2) =>
+        input => f1.asInstanceOf[FilterFunction[Any]].call(input) &&
+          f2.asInstanceOf[Any => Boolean](input)
+      case (f1, f2: FilterFunction[_]) =>
+        input => f1.asInstanceOf[Any => Boolean].apply(input) &&
+          f2.asInstanceOf[FilterFunction[Any]].call(input)
+      case (f1, f2) =>
+        input => f1.asInstanceOf[Any => Boolean].apply(input) &&
+          f2.asInstanceOf[Any => Boolean].apply(input)
+    }
+  }
+}
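
Similarly, a small hypothetical sketch of queries that CombineTypedFilters folds into a single TypedFilter, covering both Scala closures and Java FilterFunction predicates as handled by combineFilterFunction above. The session setup and names are again illustrative, not part of the commit.

```scala
import org.apache.spark.api.java.function.FilterFunction
import org.apache.spark.sql.SparkSession

object CombineTypedFiltersExample {
  def main(args: Array[String]): Unit = {
    // Illustrative local session.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("CombineTypedFiltersExample")
      .getOrCreate()
    import spark.implicits._

    val ds = spark.range(100).as[Long]

    // Two adjacent typed filters on the same object type: the rule folds them
    // into one TypedFilter whose condition is the conjunction of both predicates.
    val scalaFilters = ds.filter(_ > 10).filter(_ < 90)

    // The combined function also covers Java FilterFunction instances, and
    // mixes of FilterFunction and Scala closures.
    val evenFilter = new FilterFunction[Long] {
      override def call(value: Long): Boolean = value % 2 == 0
    }
    val mixedFilters = ds.filter(evenFilter).filter(_ < 50)

    // Inspect the optimized plans to see a single typed filter remaining.
    scalaFilters.explain(true)
    mixedFilters.explain(true)

    spark.stop()
  }
}
```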
