@@ -64,20 +64,15 @@ private[parquet] object ParquetTypesConverter {
64
64
* <ul>
65
65
* <li> Primitive types are converted to the corresponding primitive type.</li>
66
66
* <li> Group types that have a single field that is itself a group, which has repetition
67
- * level `REPEATED` and two fields (named `key` and `value`), are converted to
68
- * a [[MapType ]] with the corresponding key and value (value possibly complex)
69
- * as element type.</li>
70
- * <li> Other group types are converted as follows:<ul>
71
- * <li> Group types that have a single field with repetition `REPEATED` or themselves
72
- * have repetition level `REPEATED` are converted to an [[ArrayType ]] with the
73
- * corresponding field type (possibly primitive) as element type.</li>
74
- * <li> Other groups with a single field are converted into a [[StructType ]] with
75
- * the corresponding field type.</li>
76
- * <li> If groups have more than one field and repetition level `REPEATED` they are
77
- * converted into an [[ArrayType ]] with the corresponding [[StructType ]] as complex
78
- * element type.</li>
79
- * <li> Otherwise they are converted into a [[StructType ]] with the corresponding
80
- * field types.</li></ul></li>
67
+ * level `REPEATED` are treated as follows:<ul>
68
+ * <li> If the nested group has name `values` and repetition level `REPEATED`, the
69
+ * surrounding group is converted into an [[ArrayType ]] with the
70
+ * corresponding field type (primitive or complex) as element type.</li>
71
+ * <li> If the nested group has name `map`, repetition level `REPEATED` and two fields
72
+ * (named `key` and `value`), the surrounding group is converted into a [[MapType ]]
73
+ * with the corresponding key and value (value possibly complex) types.</li>
74
+ * <li> Other group types are converted into a [[StructType ]] with the corresponding
75
+ * field types.</li></ul></li>
81
76
* </ul>
82
77
* Note that fields are determined to be `nullable` if and only if their Parquet repetition
83
78
* level is not `REQUIRED`.
@@ -93,15 +88,16 @@ private[parquet] object ParquetTypesConverter {
93
88
// This mostly follows the convention in ``parquet.schema.ConversionPatterns``
94
89
val keyValueGroup = groupType.getFields.apply(0 ).asGroupType()
95
90
keyValueGroup.getRepetition == Repetition .REPEATED &&
96
- keyValueGroup.getName == " map" &&
97
- keyValueGroup.getFields.apply(0 ).getName == " key" &&
98
- keyValueGroup.getFields.apply(1 ).getName == " value"
91
+ keyValueGroup.getName == CatalystConverter .MAP_SCHEMA_NAME &&
92
+ keyValueGroup.getFieldCount == 2 &&
93
+ keyValueGroup.getFields.apply(0 ).getName == CatalystConverter .MAP_KEY_SCHEMA_NAME &&
94
+ keyValueGroup.getFields.apply(1 ).getName == CatalystConverter .MAP_VALUE_SCHEMA_NAME
99
95
}
100
96
}
101
97
def correspondsToArray (groupType : ParquetGroupType ): Boolean = {
102
98
groupType.getFieldCount == 1 &&
103
- ( groupType.getFields.apply (0 ).getRepetition == Repetition . REPEATED ||
104
- groupType.getRepetition == Repetition .REPEATED )
99
+ groupType.getFieldName (0 ) == CatalystConverter . ARRAY_ELEMENTS_SCHEMA_NAME &&
100
+ groupType.getFields.apply( 0 ). getRepetition == Repetition .REPEATED
105
101
}
106
102
107
103
if (parquetType.isPrimitive) {
@@ -112,17 +108,9 @@ private[parquet] object ParquetTypesConverter {
112
108
// if the schema was constructed programmatically there may be hints how to convert
113
109
// it inside the metadata via the OriginalType field
114
110
case ParquetOriginalType .LIST => { // TODO: check enums!
115
- val fields = groupType.getFields.map {
116
- field => new StructField (
117
- field.getName,
118
- toDataType(field),
119
- field.getRepetition != Repetition .REQUIRED )
120
- }
121
- if (fields.size == 1 ) {
122
- new ArrayType (fields.apply(0 ).dataType)
123
- } else {
124
- new ArrayType (StructType (fields))
125
- }
111
+ assert(groupType.getFieldCount == 1 )
112
+ val field = groupType.getFields.apply(0 )
113
+ new ArrayType (toDataType(field))
126
114
}
127
115
case ParquetOriginalType .MAP => {
128
116
assert(
@@ -153,16 +141,7 @@ private[parquet] object ParquetTypesConverter {
153
141
ptype.getName,
154
142
toDataType(ptype),
155
143
ptype.getRepetition != Repetition .REQUIRED ))
156
-
157
- if (groupType.getFieldCount == 1 ) {
158
- new StructType (fields)
159
- } else {
160
- if (parquetType.getRepetition == Repetition .REPEATED ) {
161
- new ArrayType (StructType (fields))
162
- } else {
163
- new StructType (fields)
164
- }
165
- }
144
+ new StructType (fields)
166
145
}
167
146
}
168
147
}
@@ -199,17 +178,17 @@ private[parquet] object ParquetTypesConverter {
199
178
* <li> Primitive types are converted into Parquet's primitive types.</li>
200
179
* <li> [[org.apache.spark.sql.catalyst.types.StructType ]]s are converted
201
180
* into Parquet's `GroupType` with the corresponding field types.</li>
181
+ * <li> [[org.apache.spark.sql.catalyst.types.ArrayType ]]s are converted
182
+ * into a 2-level nested group, where the outer group has the inner
183
+ * group as sole field. The inner group has name `values` and
184
+ * repetition level `REPEATED` and has the element type of
185
+ * the array as schema. We use Parquet's `ConversionPatterns` for this
186
+ * purpose.</li>
202
187
* <li> [[org.apache.spark.sql.catalyst.types.MapType ]]s are converted
203
- * into a nested (2-level) Parquet `GroupType` with two fields: a key type and
204
- * a value type. The nested group has repetition level `REPEATED`.</li>
205
- * <li> [[org.apache.spark.sql.catalyst.types.ArrayType ]]s are handled as follows:<ul>
206
- * <li> If their element is complex, that is of type
207
- * [[org.apache.spark.sql.catalyst.types.StructType ]], they are converted
208
- * into a `GroupType` with the corresponding field types of the struct and
209
- * original type of the `GroupType` is set to `LIST`.</li>
210
- * <li> Otherwise, that is they contain a primitive they are converted into a `GroupType`
211
- * that is also a list but has only a single field of the type corresponding to
212
- * the element type.</li></ul></li>
188
+ * into a nested (2-level) Parquet `GroupType` with two fields: a key
189
+ * type and a value type. The nested group has repetition level
190
+ * `REPEATED` and name `map`. We use Parquet's `ConversionPatterns`
191
+ * for this purpose.</li>
213
192
* </ul>
214
193
* Parquet's repetition level is generally set according to the following rule:
215
194
* <ul>
@@ -218,11 +197,8 @@ private[parquet] object ParquetTypesConverter {
218
197
* <li> Otherwise, if the attribute whose type is converted is `nullable`, the Parquet
219
198
* type gets repetition level `OPTIONAL` and otherwise `REQUIRED`.</li>
220
199
* </ul>
221
- * The single exception to this rule is an [[org.apache.spark.sql.catalyst.types.ArrayType ]]
222
- * that contains a [[org.apache.spark.sql.catalyst.types.StructType ]], whose repetition level
223
- * is always set to `REPEATED`.
224
200
*
225
- * @param ctype The type to convert.
201
+ * @param ctype The type to convert.
226
202
* @param name The name of the [[org.apache.spark.sql.catalyst.expressions.Attribute ]]
227
203
* whose type is converted
228
204
* @param nullable When true indicates that the attribute is nullable
@@ -245,43 +221,38 @@ private[parquet] object ParquetTypesConverter {
245
221
new ParquetPrimitiveType (repetition, primitiveType.get, name)
246
222
} else {
247
223
ctype match {
248
- case ArrayType (elementType : DataType ) => {
249
- elementType match {
250
- case StructType (fields) => { // first case: array of structs
251
- val parquetFieldTypes = fields.map(
252
- f => fromDataType(f.dataType, f.name, f.nullable, inArray = false ))
253
- assert(
254
- fields.size > 1 ,
255
- " Found struct inside array with a single field.. error parsing Catalyst schema" )
256
- new ParquetGroupType (
257
- Repetition .REPEATED ,
258
- name,
259
- ParquetOriginalType .LIST ,
260
- parquetFieldTypes)
261
- }
262
- case _ => { // second case: array of primitive types
263
- val parquetElementType = fromDataType(
264
- elementType,
265
- CatalystConverter .ARRAY_ELEMENTS_SCHEMA_NAME ,
266
- nullable = false ,
267
- inArray = true )
268
- ConversionPatterns .listType(repetition, name, parquetElementType)
269
- }
270
- }
224
+ case ArrayType (elementType) => {
225
+ val parquetElementType = fromDataType(
226
+ elementType,
227
+ CatalystConverter .ARRAY_ELEMENTS_SCHEMA_NAME ,
228
+ nullable = false ,
229
+ inArray = true )
230
+ ConversionPatterns .listType(repetition, name, parquetElementType)
271
231
}
272
- // TODO: test structs inside arrays
273
232
case StructType (structFields) => {
274
233
val fields = structFields.map {
275
234
field => fromDataType(field.dataType, field.name, field.nullable, inArray = false )
276
235
}
277
236
new ParquetGroupType (repetition, name, fields)
278
237
}
279
238
case MapType (keyType, valueType) => {
239
+ val parquetKeyType =
240
+ fromDataType(
241
+ keyType,
242
+ CatalystConverter .MAP_KEY_SCHEMA_NAME ,
243
+ false ,
244
+ inArray = false )
245
+ val parquetValueType =
246
+ fromDataType(
247
+ valueType,
248
+ CatalystConverter .MAP_VALUE_SCHEMA_NAME ,
249
+ true ,
250
+ inArray = false )
280
251
ConversionPatterns .mapType(
281
252
repetition,
282
253
name,
283
- fromDataType(keyType, " key " , false , inArray = false ) ,
284
- fromDataType(valueType, " value " , true , inArray = false ) )
254
+ parquetKeyType ,
255
+ parquetValueType )
285
256
}
286
257
case _ => sys.error(s " Unsupported datatype $ctype" )
287
258
}
0 commit comments