@@ -134,12 +134,11 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
134
134
// SPARK-8501: Empty ORC files always have an empty schema stored in their footer. In this
135
135
// case, `OrcFileOperator.readSchema` returns `None`, and we can't read the underlying file
136
136
// using the given physical schema. Instead, we simply return an empty iterator.
137
- val maybePhysicalSchema = OrcFileOperator .readSchema(Seq (file.filePath), Some (conf))
138
- if (maybePhysicalSchema.isEmpty ) {
137
+ val isEmptyFile = OrcFileOperator .readSchema(Seq (file.filePath), Some (conf)).isEmpty
138
+ if (isEmptyFile ) {
139
139
Iterator .empty
140
140
} else {
141
- val physicalSchema = maybePhysicalSchema.get
142
- OrcRelation .setRequiredColumns(conf, physicalSchema, requiredSchema)
141
+ OrcRelation .setRequiredColumns(conf, dataSchema, requiredSchema)
143
142
144
143
val orcRecordReader = {
145
144
val job = Job .getInstance(conf)
@@ -163,6 +162,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
163
162
// Unwraps `OrcStruct`s to `UnsafeRow`s
164
163
OrcRelation .unwrapOrcStructs(
165
164
conf,
165
+ dataSchema,
166
166
requiredSchema,
167
167
Some (orcRecordReader.getObjectInspector.asInstanceOf [StructObjectInspector ]),
168
168
recordsIterator)
@@ -272,25 +272,32 @@ private[orc] object OrcRelation extends HiveInspectors {
272
272
def unwrapOrcStructs (
273
273
conf : Configuration ,
274
274
dataSchema : StructType ,
275
+ requiredSchema : StructType ,
275
276
maybeStructOI : Option [StructObjectInspector ],
276
277
iterator : Iterator [Writable ]): Iterator [InternalRow ] = {
277
278
val deserializer = new OrcSerde
278
- val mutableRow = new SpecificInternalRow (dataSchema .map(_.dataType))
279
- val unsafeProjection = UnsafeProjection .create(dataSchema )
279
+ val mutableRow = new SpecificInternalRow (requiredSchema .map(_.dataType))
280
+ val unsafeProjection = UnsafeProjection .create(requiredSchema )
280
281
281
282
def unwrap (oi : StructObjectInspector ): Iterator [InternalRow ] = {
282
- val (fieldRefs, fieldOrdinals) = dataSchema.zipWithIndex.map {
283
- case (field, ordinal) => oi.getStructFieldRef(field.name) -> ordinal
283
+ val (fieldRefs, fieldOrdinals) = requiredSchema.zipWithIndex.map {
284
+ case (field, ordinal) =>
285
+ var ref = oi.getStructFieldRef(field.name)
286
+ if (ref == null ) {
287
+ ref = oi.getStructFieldRef(" _col" + dataSchema.fieldIndex(field.name))
288
+ }
289
+ ref -> ordinal
284
290
}.unzip
285
291
286
- val unwrappers = fieldRefs.map(unwrapperFor)
292
+ val unwrappers = fieldRefs.map(r => if (r == null ) null else unwrapperFor(r) )
287
293
288
294
iterator.map { value =>
289
295
val raw = deserializer.deserialize(value)
290
296
var i = 0
291
297
val length = fieldRefs.length
292
298
while (i < length) {
293
- val fieldValue = oi.getStructFieldData(raw, fieldRefs(i))
299
+ val fieldRef = fieldRefs(i)
300
+ val fieldValue = if (fieldRef == null ) null else oi.getStructFieldData(raw, fieldRef)
294
301
if (fieldValue == null ) {
295
302
mutableRow.setNullAt(fieldOrdinals(i))
296
303
} else {
@@ -306,8 +313,8 @@ private[orc] object OrcRelation extends HiveInspectors {
306
313
}
307
314
308
315
/**
 * Records in `conf` which physical columns the ORC reader should materialize.
 *
 * Each field of `requestedSchema` is resolved to its ordinal position within
 * `dataSchema` (the full on-disk schema), and the resulting (id, name) pairs are
 * handed to `HiveShim.appendReadColumns` in ascending-id order.
 *
 * @param conf            Hadoop configuration the read-column list is appended to
 * @param dataSchema      full schema of the data files on disk
 * @param requestedSchema subset of columns the query actually needs
 */
def setRequiredColumns(
    conf: Configuration, dataSchema: StructType, requestedSchema: StructType): Unit = {
  // Pair each requested column with its ordinal in the full schema. The explicit
  // `Integer` ascription matches the boxed type `appendReadColumns` expects.
  val idNamePairs: Seq[(Integer, String)] =
    requestedSchema.fieldNames.toSeq.map { name =>
      (dataSchema.fieldIndex(name): Integer, name)
    }
  // Keep ids and names aligned while sorting by column id (tuple ordering sorts
  // on the id component first), then split back into parallel sequences.
  val (sortedIDs, sortedNames) = idNamePairs.sorted.unzip
  HiveShim.appendReadColumns(conf, sortedIDs, sortedNames)
}
0 commit comments