@@ -36,6 +36,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReferences
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
+import org.apache.spark.sql.connector.write.WriterCommitMessage
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.{ProjectExec, SortExec, SparkPlan, SQLExecution, UnsafeExternalRowSorter}
 import org.apache.spark.sql.internal.SQLConf
@@ -103,14 +104,6 @@ object FileFormatWriter extends Logging {
       .map(FileSourceMetadataAttribute.cleanupFileSourceMetadataInformation))
     val dataColumns = finalOutputSpec.outputColumns.filterNot(partitionSet.contains)
 
-    val hasEmpty2Null = plan.exists(p => V1WritesUtils.hasEmptyToNull(p.expressions))
-    val empty2NullPlan = if (hasEmpty2Null) {
-      plan
-    } else {
-      val projectList = V1WritesUtils.convertEmptyToNull(plan.output, partitionColumns)
-      if (projectList.nonEmpty) ProjectExec(projectList, plan) else plan
-    }
-
     val writerBucketSpec = V1WritesUtils.getWriterBucketSpec(bucketSpec, dataColumns, options)
     val sortColumns = V1WritesUtils.getBucketSortColumns(bucketSpec, dataColumns)
 
@@ -144,9 +137,10 @@ object FileFormatWriter extends Logging {
     // columns.
     val requiredOrdering = partitionColumns.drop(numStaticPartitionCols) ++
       writerBucketSpec.map(_.bucketIdExpression) ++ sortColumns
+    val writeFilesOpt = V1WritesUtils.getWriteFilesOpt(plan)
     // the sort order doesn't matter
     // Use the output ordering from the original plan before adding the empty2null projection.
-    val actualOrdering = plan.outputOrdering.map(_.child)
+    val actualOrdering = writeFilesOpt.map(_.child).getOrElse(plan).outputOrdering.map(_.child)
     val orderingMatched = V1WritesUtils.isOrderingMatched(requiredOrdering, actualOrdering)
 
     SQLExecution.checkSQLExecutionId(sparkSession)
@@ -155,10 +149,6 @@ object FileFormatWriter extends Logging {
     // get an ID guaranteed to be unique.
     job.getConfiguration.set("spark.sql.sources.writeJobUUID", description.uuid)
 
-    // This call shouldn't be put into the `try` block below because it only initializes and
-    // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called.
-    committer.setupJob(job)
-
     // When `PLANNED_WRITE_ENABLED` is true, the optimizer rule V1Writes will add logical sort
     // operator based on the required ordering of the V1 write command. So the output
     // ordering of the physical plan should always match the required ordering. Here
@@ -169,27 +159,55 @@ object FileFormatWriter extends Logging {
     // V1 write command will be empty).
     if (Utils.isTesting) outputOrderingMatched = orderingMatched
 
-    try {
+    if (writeFilesOpt.isDefined) {
+      // build `WriteFilesSpec` for `WriteFiles`
+      val concurrentOutputWriterSpecFunc = (plan: SparkPlan) => {
+        val sortPlan = createSortPlan(plan, requiredOrdering, outputSpec)
+        createConcurrentOutputWriterSpec(sparkSession, sortPlan, sortColumns)
+      }
+      val writeSpec = WriteFilesSpec(
+        description = description,
+        committer = committer,
+        concurrentOutputWriterSpecFunc = concurrentOutputWriterSpecFunc
+      )
+      executeWrite(sparkSession, plan, writeSpec, job)
+    } else {
+      executeWrite(sparkSession, plan, job, description, committer, outputSpec,
+        requiredOrdering, partitionColumns, sortColumns, orderingMatched)
+    }
+  }
+  // scalastyle:on argcount
+
+  private def executeWrite(
+      sparkSession: SparkSession,
+      plan: SparkPlan,
+      job: Job,
+      description: WriteJobDescription,
+      committer: FileCommitProtocol,
+      outputSpec: OutputSpec,
+      requiredOrdering: Seq[Expression],
+      partitionColumns: Seq[Attribute],
+      sortColumns: Seq[Attribute],
+      orderingMatched: Boolean): Set[String] = {
+    val hasEmpty2Null = plan.exists(p => V1WritesUtils.hasEmptyToNull(p.expressions))
+    val empty2NullPlan = if (hasEmpty2Null) {
+      plan
+    } else {
+      val projectList = V1WritesUtils.convertEmptyToNull(plan.output, partitionColumns)
+      if (projectList.nonEmpty) ProjectExec(projectList, plan) else plan
+    }
+
+    writeAndCommit(job, description, committer) {
       val (rdd, concurrentOutputWriterSpec) = if (orderingMatched) {
         (empty2NullPlan.execute(), None)
       } else {
-        // SPARK-21165: the `requiredOrdering` is based on the attributes from analyzed plan, and
-        // the physical plan may have different attribute ids due to optimizer removing some
-        // aliases. Here we bind the expression ahead to avoid potential attribute ids mismatch.
-        val orderingExpr = bindReferences(
-          requiredOrdering.map(SortOrder(_, Ascending)), finalOutputSpec.outputColumns)
-        val sortPlan = SortExec(
-          orderingExpr,
-          global = false,
-          child = empty2NullPlan)
-
-        val maxWriters = sparkSession.sessionState.conf.maxConcurrentOutputFileWriters
-        val concurrentWritersEnabled = maxWriters > 0 && sortColumns.isEmpty
-        if (concurrentWritersEnabled) {
-          (empty2NullPlan.execute(),
-            Some(ConcurrentOutputWriterSpec(maxWriters, () => sortPlan.createSorter())))
+        val sortPlan = createSortPlan(empty2NullPlan, requiredOrdering, outputSpec)
+        val concurrentOutputWriterSpec = createConcurrentOutputWriterSpec(
+          sparkSession, sortPlan, sortColumns)
+        if (concurrentOutputWriterSpec.isDefined) {
+          (empty2NullPlan.execute(), concurrentOutputWriterSpec)
         } else {
-          (sortPlan.execute(), None)
+          (sortPlan.execute(), concurrentOutputWriterSpec)
         }
       }
 
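For orientation, the legacy branch above reduces to a three-way choice of which RDD feeds the write tasks: write directly when the plan already satisfies the required ordering, write unsorted with concurrent output writers when they are enabled and there are no bucket sort columns, and otherwise sort first. A minimal standalone sketch of that decision; `WriteInput` and `chooseWriteInput` are simplified placeholder names for illustration, not the real Spark API:

```scala
// Illustrative only: a compressed view of the branch above.
sealed trait WriteInput
case class Unsorted(concurrentWriters: Boolean) extends WriteInput
case object Sorted extends WriteInput

def chooseWriteInput(
    orderingMatched: Boolean,
    maxConcurrentWriters: Int,
    hasBucketSortColumns: Boolean): WriteInput = {
  if (orderingMatched) {
    // The plan already produces rows in the required partition/bucket order: write as-is.
    Unsorted(concurrentWriters = false)
  } else if (maxConcurrentWriters > 0 && !hasBucketSortColumns) {
    // Concurrent writers keep several output files open at once, so the explicit
    // sort is skipped; the sorter factory is only retained as a fallback.
    Unsorted(concurrentWriters = true)
  } else {
    // Sort by partition, bucket, and sort columns so each task writes one file at a time.
    Sorted
  }
}
```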
@@ -221,7 +239,19 @@ object FileFormatWriter extends Logging {
           committer.onTaskCommit(res.commitMsg)
           ret(index) = res
         })
+      ret
+    }
+  }
 
+  private def writeAndCommit(
+      job: Job,
+      description: WriteJobDescription,
+      committer: FileCommitProtocol)(f: => Array[WriteTaskResult]): Set[String] = {
+    // This call shouldn't be put into the `try` block below because it only initializes and
+    // prepares the job, any exception thrown from here shouldn't cause abortJob() to be called.
+    committer.setupJob(job)
+    try {
+      val ret = f
       val commitMsgs = ret.map(_.commitMsg)
 
       logInfo(s"Start to commit write Job ${description.uuid}.")
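The new `writeAndCommit` helper factors the commit protocol's job lifecycle out of both write paths. A minimal sketch of the same loan pattern, under the assumption (consistent with the elided context lines) that the job is committed with the collected task messages on success and aborted on failure; `withJobCommit` is a hypothetical name, not part of FileFormatWriter:

```scala
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage

// Sketch of the loan pattern used by writeAndCommit.
def withJobCommit(job: Job, committer: FileCommitProtocol)
    (body: => Seq[TaskCommitMessage]): Unit = {
  // Deliberately outside `try`: a setup failure must not trigger abortJob().
  committer.setupJob(job)
  try {
    val commitMsgs = body                  // run the distributed write tasks
    committer.commitJob(job, commitMsgs)   // assumed, from the elided context
  } catch {
    case cause: Throwable =>
      committer.abortJob(job)              // assumed, from the elided context
      throw cause
  }
}
```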
@@ -239,10 +269,70 @@ object FileFormatWriter extends Logging {
         throw cause
     }
   }
-  // scalastyle:on argcount
+
+  /**
+   * Write files using [[SparkPlan.executeWrite]]
+   */
+  private def executeWrite(
+      session: SparkSession,
+      planForWrites: SparkPlan,
+      writeFilesSpec: WriteFilesSpec,
+      job: Job): Set[String] = {
+    val committer = writeFilesSpec.committer
+    val description = writeFilesSpec.description
+
+    writeAndCommit(job, description, committer) {
+      val rdd = planForWrites.executeWrite(writeFilesSpec)
+      val ret = new Array[WriteTaskResult](rdd.partitions.length)
+      session.sparkContext.runJob(
+        rdd,
+        (context: TaskContext, iter: Iterator[WriterCommitMessage]) => {
+          assert(iter.hasNext)
+          val commitMessage = iter.next()
+          assert(!iter.hasNext)
+          commitMessage
+        },
+        rdd.partitions.indices,
+        (index, res: WriterCommitMessage) => {
+          assert(res.isInstanceOf[WriteTaskResult])
+          val writeTaskResult = res.asInstanceOf[WriteTaskResult]
+          committer.onTaskCommit(writeTaskResult.commitMsg)
+          ret(index) = writeTaskResult
+        })
+      ret
+    }
+  }
+
+  private def createSortPlan(
+      plan: SparkPlan,
+      requiredOrdering: Seq[Expression],
+      outputSpec: OutputSpec): SortExec = {
+    // SPARK-21165: the `requiredOrdering` is based on the attributes from analyzed plan, and
+    // the physical plan may have different attribute ids due to optimizer removing some
+    // aliases. Here we bind the expression ahead to avoid potential attribute ids mismatch.
+    val orderingExpr = bindReferences(
+      requiredOrdering.map(SortOrder(_, Ascending)), outputSpec.outputColumns)
+    SortExec(
+      orderingExpr,
+      global = false,
+      child = plan)
+  }
+
+  private def createConcurrentOutputWriterSpec(
+      sparkSession: SparkSession,
+      sortPlan: SortExec,
+      sortColumns: Seq[Attribute]): Option[ConcurrentOutputWriterSpec] = {
+    val maxWriters = sparkSession.sessionState.conf.maxConcurrentOutputFileWriters
+    val concurrentWritersEnabled = maxWriters > 0 && sortColumns.isEmpty
+    if (concurrentWritersEnabled) {
+      Some(ConcurrentOutputWriterSpec(maxWriters, () => sortPlan.createSorter()))
+    } else {
+      None
+    }
+  }
 
   /** Writes data out in a single Spark task. */
-  private def executeTask(
+  private[spark] def executeTask(
       description: WriteJobDescription,
       jobTrackerID: String,
       sparkStageId: Int,