1717
1818package org .apache .spark .sql .execution .datasources .parquet
1919
20- import java .util .{Map => JMap , TimeZone }
20+ import java .util .{Locale , Map => JMap , TimeZone }
2121
2222import scala .collection .JavaConverters ._
2323
@@ -30,6 +30,7 @@ import org.apache.parquet.schema.Type.Repetition
3030
3131import org .apache .spark .internal .Logging
3232import org .apache .spark .sql .catalyst .expressions .UnsafeRow
33+ import org .apache .spark .sql .internal .SQLConf
3334import org .apache .spark .sql .types ._
3435
3536/**
@@ -71,8 +72,10 @@ private[parquet] class ParquetReadSupport(val convertTz: Option[TimeZone])
7172 StructType .fromString(schemaString)
7273 }
7374
74- val parquetRequestedSchema =
75- ParquetReadSupport .clipParquetSchema(context.getFileSchema, catalystRequestedSchema)
75+ val caseSensitive = context.getConfiguration.getBoolean(SQLConf .CASE_SENSITIVE .key,
76+ SQLConf .CASE_SENSITIVE .defaultValue.get)
77+ val parquetRequestedSchema = ParquetReadSupport .clipParquetSchema(
78+ context.getFileSchema, catalystRequestedSchema, caseSensitive)
7679
7780 new ReadContext (parquetRequestedSchema, Map .empty[String , String ].asJava)
7881 }
@@ -117,8 +120,12 @@ private[parquet] object ParquetReadSupport {
117120 * Tailors `parquetSchema` according to `catalystSchema` by removing column paths that don't exist
118121 * in `catalystSchema`, and adding those that only exist in `catalystSchema`.
119122 */
120- def clipParquetSchema (parquetSchema : MessageType , catalystSchema : StructType ): MessageType = {
121- val clippedParquetFields = clipParquetGroupFields(parquetSchema.asGroupType(), catalystSchema)
123+ def clipParquetSchema (
124+ parquetSchema : MessageType ,
125+ catalystSchema : StructType ,
126+ caseSensitive : Boolean = true ): MessageType = {
127+ val clippedParquetFields = clipParquetGroupFields(
128+ parquetSchema.asGroupType(), catalystSchema, caseSensitive)
122129 if (clippedParquetFields.isEmpty) {
123130 ParquetSchemaConverter .EMPTY_MESSAGE
124131 } else {
@@ -129,20 +136,21 @@ private[parquet] object ParquetReadSupport {
129136 }
130137 }
131138
132- private def clipParquetType (parquetType : Type , catalystType : DataType ): Type = {
139+ private def clipParquetType (
140+ parquetType : Type , catalystType : DataType , caseSensitive : Boolean ): Type = {
133141 catalystType match {
134142 case t : ArrayType if ! isPrimitiveCatalystType(t.elementType) =>
135143 // Only clips array types with nested type as element type.
136- clipParquetListType(parquetType.asGroupType(), t.elementType)
144+ clipParquetListType(parquetType.asGroupType(), t.elementType, caseSensitive )
137145
138146 case t : MapType
139147 if ! isPrimitiveCatalystType(t.keyType) ||
140148 ! isPrimitiveCatalystType(t.valueType) =>
141149 // Only clips map types with nested key type or value type
142- clipParquetMapType(parquetType.asGroupType(), t.keyType, t.valueType)
150+ clipParquetMapType(parquetType.asGroupType(), t.keyType, t.valueType, caseSensitive )
143151
144152 case t : StructType =>
145- clipParquetGroup(parquetType.asGroupType(), t)
153+ clipParquetGroup(parquetType.asGroupType(), t, caseSensitive )
146154
147155 case _ =>
148156 // UDTs and primitive types are not clipped. For UDTs, a clipped version might not be able
@@ -168,14 +176,15 @@ private[parquet] object ParquetReadSupport {
168176 * of the [[ArrayType ]] should also be a nested type, namely an [[ArrayType ]], a [[MapType ]], or a
169177 * [[StructType ]].
170178 */
171- private def clipParquetListType (parquetList : GroupType , elementType : DataType ): Type = {
179+ private def clipParquetListType (
180+ parquetList : GroupType , elementType : DataType , caseSensitive : Boolean ): Type = {
172181 // Precondition of this method, should only be called for lists with nested element types.
173182 assert(! isPrimitiveCatalystType(elementType))
174183
175184 // Unannotated repeated group should be interpreted as required list of required element, so
176185 // list element type is just the group itself. Clip it.
177186 if (parquetList.getOriginalType == null && parquetList.isRepetition(Repetition .REPEATED )) {
178- clipParquetType(parquetList, elementType)
187+ clipParquetType(parquetList, elementType, caseSensitive )
179188 } else {
180189 assert(
181190 parquetList.getOriginalType == OriginalType .LIST ,
@@ -207,7 +216,7 @@ private[parquet] object ParquetReadSupport {
207216 Types
208217 .buildGroup(parquetList.getRepetition)
209218 .as(OriginalType .LIST )
210- .addField(clipParquetType(repeatedGroup, elementType))
219+ .addField(clipParquetType(repeatedGroup, elementType, caseSensitive ))
211220 .named(parquetList.getName)
212221 } else {
213222 // Otherwise, the repeated field's type is the element type with the repeated field's
@@ -218,7 +227,7 @@ private[parquet] object ParquetReadSupport {
218227 .addField(
219228 Types
220229 .repeatedGroup()
221- .addField(clipParquetType(repeatedGroup.getType(0 ), elementType))
230+ .addField(clipParquetType(repeatedGroup.getType(0 ), elementType, caseSensitive ))
222231 .named(repeatedGroup.getName))
223232 .named(parquetList.getName)
224233 }
@@ -231,7 +240,10 @@ private[parquet] object ParquetReadSupport {
231240 * a [[StructType ]].
232241 */
233242 private def clipParquetMapType (
234- parquetMap : GroupType , keyType : DataType , valueType : DataType ): GroupType = {
243+ parquetMap : GroupType ,
244+ keyType : DataType ,
245+ valueType : DataType ,
246+ caseSensitive : Boolean ): GroupType = {
235247 // Precondition of this method, only handles maps with nested key types or value types.
236248 assert(! isPrimitiveCatalystType(keyType) || ! isPrimitiveCatalystType(valueType))
237249
@@ -243,8 +255,8 @@ private[parquet] object ParquetReadSupport {
243255 Types
244256 .repeatedGroup()
245257 .as(repeatedGroup.getOriginalType)
246- .addField(clipParquetType(parquetKeyType, keyType))
247- .addField(clipParquetType(parquetValueType, valueType))
258+ .addField(clipParquetType(parquetKeyType, keyType, caseSensitive ))
259+ .addField(clipParquetType(parquetValueType, valueType, caseSensitive ))
248260 .named(repeatedGroup.getName)
249261
250262 Types
@@ -262,8 +274,9 @@ private[parquet] object ParquetReadSupport {
262274 * [[MessageType ]]. Because it's legal to construct an empty requested schema for column
263275 * pruning.
264276 */
265- private def clipParquetGroup (parquetRecord : GroupType , structType : StructType ): GroupType = {
266- val clippedParquetFields = clipParquetGroupFields(parquetRecord, structType)
277+ private def clipParquetGroup (
278+ parquetRecord : GroupType , structType : StructType , caseSensitive : Boolean ): GroupType = {
279+ val clippedParquetFields = clipParquetGroupFields(parquetRecord, structType, caseSensitive)
267280 Types
268281 .buildGroup(parquetRecord.getRepetition)
269282 .as(parquetRecord.getOriginalType)
@@ -277,14 +290,35 @@ private[parquet] object ParquetReadSupport {
277290 * @return A list of clipped [[GroupType ]] fields, which can be empty.
278291 */
279292 private def clipParquetGroupFields (
280- parquetRecord : GroupType , structType : StructType ): Seq [Type ] = {
281- val parquetFieldMap = parquetRecord.getFields.asScala.map(f => f.getName -> f).toMap
293+ parquetRecord : GroupType , structType : StructType , caseSensitive : Boolean ): Seq [Type ] = {
    // Produces one Parquet field per field of `structType`, in `structType` order. A Catalyst
    // field that matches a field of `parquetRecord` is clipped recursively via `clipParquetType`;
    // a Catalyst field with no match is synthesized from its Catalyst definition by `toParquet`.
282294 val toParquet = new SparkToParquetSchemaConverter (writeLegacyParquetFormat = false )
283- structType.map { f =>
284- parquetFieldMap
285- .get(f.name)
286- .map(clipParquetType(_, f.dataType))
287- .getOrElse(toParquet.convertField(f))
    // Case-sensitive mode: Parquet fields are indexed by their exact name and each Catalyst
    // field is looked up by its exact name.
295+ if (caseSensitive) {
296+ val caseSensitiveParquetFieldMap =
297+ parquetRecord.getFields.asScala.map(f => f.getName -> f).toMap
298+ structType.map { f =>
299+ caseSensitiveParquetFieldMap
300+ .get(f.name)
301+ .map(clipParquetType(_, f.dataType, caseSensitive))
302+ .getOrElse(toParquet.convertField(f))
303+ }
304+ } else {
305+ // Do case-insensitive resolution only if in case-insensitive mode
    // Parquet fields are grouped by their lower-cased name (Locale.ROOT), so fields whose names
    // differ only by case end up in the same group; lookups lower-case the Catalyst name the
    // same way.
306+ val caseInsensitiveParquetFieldMap =
307+ parquetRecord.getFields.asScala.groupBy(_.getName.toLowerCase(Locale .ROOT ))
308+ structType.map { f =>
309+ caseInsensitiveParquetFieldMap
310+ .get(f.name.toLowerCase(Locale .ROOT ))
311+ .map { parquetTypes =>
    // A group with more than one member means several Parquet fields match this Catalyst
    // field ignoring case; that is reported as an error rather than picking one. Otherwise the
    // group holds exactly one field (groupBy never yields empty groups), which is clipped.
312+ if (parquetTypes.size > 1 ) {
313+ // Need to fail if there is ambiguity, i.e. more than one field is matched
314+ val parquetTypesString = parquetTypes.map(_.getName).mkString(" [" , " , " , " ]" )
315+ throw new RuntimeException (s """ Found duplicate field(s) " ${f.name}": """ +
316+ s " $parquetTypesString in case-insensitive mode " )
317+ } else {
318+ clipParquetType(parquetTypes.head, f.dataType, caseSensitive)
319+ }
320+ }.getOrElse(toParquet.convertField(f))
321+ }
288322 }
289323 }
290324
0 commit comments