Skip to content

Commit f1e49f3

Browse files
committed
add backToBackMapPrimitive in DatasetBenchmark
The backToBackMapPrimitive is almost equivalent with the original backToBackMap implementation for DataFrame
1 parent ca21e67 commit f1e49f3

File tree

1 file changed

+64
-11
lines changed

1 file changed

+64
-11
lines changed

sql/core/src/test/scala/org/apache/spark/sql/DatasetBenchmark.scala

Lines changed: 64 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,47 @@ object DatasetBenchmark {
7272
benchmark
7373
}
7474

75+
def backToBackMapPrimitive(spark: SparkSession, numRows: Long, numChains: Int): Benchmark = {
76+
import spark.implicits._
77+
78+
val df = spark.range(1, numRows).select($"id".as("l"))
79+
val benchmark = new Benchmark("back-to-back map for primitive", numRows)
80+
val func = (d: Long) => d+1
81+
82+
val rdd = spark.sparkContext.range(1, numRows).map(l => l.toLong)
83+
benchmark.addCase("RDD") { iter =>
84+
var res = rdd
85+
var i = 0
86+
while (i < numChains) {
87+
res = rdd.map(func)
88+
i += 1
89+
}
90+
res.foreach(_ => Unit)
91+
}
92+
93+
benchmark.addCase("DataFrame") { iter =>
94+
var res = df
95+
var i = 0
96+
while (i < numChains) {
97+
res = res.select($"l" + 1 as "l")
98+
i += 1
99+
}
100+
res.queryExecution.toRdd.foreach(_ => Unit)
101+
}
102+
103+
benchmark.addCase("Dataset") { iter =>
104+
var res = df.as[Long]
105+
var i = 0
106+
while (i < numChains) {
107+
res = res.map(func)
108+
i += 1
109+
}
110+
res.queryExecution.toRdd.foreach(_ => Unit)
111+
}
112+
113+
benchmark
114+
}
115+
75116
def backToBackFilter(spark: SparkSession, numRows: Long, numChains: Int): Benchmark = {
76117
import spark.implicits._
77118

@@ -166,20 +207,32 @@ object DatasetBenchmark {
166207
val numChains = 10
167208

168209
val benchmark = backToBackMap(spark, numRows, numChains)
169-
val benchmark2 = backToBackFilter(spark, numRows, numChains)
170-
val benchmark3 = aggregate(spark, numRows)
210+
val benchmark2 = backToBackMapPrimitive(spark, numRows, numChains)
211+
val benchmark3 = backToBackFilter(spark, numRows, numChains)
212+
val benchmark4 = aggregate(spark, numRows)
171213

172214
/*
173-
Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11.4
174-
Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
175-
back-to-back map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
176-
-------------------------------------------------------------------------------------------
177-
RDD 1935 / 2105 51.7 19.3 1.0X
178-
DataFrame 756 / 799 132.3 7.6 2.6X
179-
Dataset 7359 / 7506 13.6 73.6 0.3X
215+
Java HotSpot(TM) 64-Bit Server VM 1.8.0_91-b14 on Mac OS X 10.11.5
216+
Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz
217+
back-to-back map: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
218+
------------------------------------------------------------------------------------------------
219+
RDD 5144 / 5330 19.4 51.4 1.0X
220+
DataFrame 6878 / 7133 14.5 68.8 0.7X
221+
Dataset 13760 / 14076 7.3 137.6 0.4X
180222
*/
181223
benchmark.run()
182224

225+
/*
226+
Java HotSpot(TM) 64-Bit Server VM 1.8.0_91-b14 on Mac OS X 10.11.5
227+
Intel(R) Core(TM) i5-5257U CPU @ 2.70GHz
228+
back-to-back map for primitive: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
229+
------------------------------------------------------------------------------------------------
230+
RDD 1963 / 2172 50.9 19.6 1.0X
231+
DataFrame 1563 / 1746 64.0 15.6 1.3X
232+
Dataset 5640 / 5810 17.7 56.4 0.3X
233+
*/
234+
benchmark2.run()
235+
183236
/*
184237
Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11.4
185238
Intel(R) Core(TM) i7-4960HQ CPU @ 2.60GHz
@@ -189,7 +242,7 @@ object DatasetBenchmark {
189242
DataFrame 103 / 127 967.4 1.0 19.1X
190243
Dataset 4343 / 4477 23.0 43.4 0.5X
191244
*/
192-
benchmark2.run()
245+
benchmark3.run()
193246

194247
/*
195248
Java HotSpot(TM) 64-Bit Server VM 1.8.0_60-b27 on Mac OS X 10.11.4
@@ -201,6 +254,6 @@ object DatasetBenchmark {
201254
Dataset sum using Aggregator 4111 / 4282 24.3 41.1 0.5X
202255
Dataset complex Aggregator 8782 / 9036 11.4 87.8 0.2X
203256
*/
204-
benchmark3.run()
257+
benchmark4.run()
205258
}
206259
}

0 commit comments

Comments
 (0)