
Commit b539fde

First commit for MapType
1 parent a594aed commit b539fde

4 files changed: +236 -23 lines changed

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetConverter.scala

Lines changed: 71 additions & 1 deletion
@@ -26,6 +26,7 @@ import org.apache.spark.Logging
 import org.apache.spark.sql.catalyst.types._
 import org.apache.spark.sql.catalyst.expressions.{GenericRow, Row, Attribute}
 import org.apache.spark.sql.parquet.CatalystConverter.FieldType
+import scala.collection.mutable

 private[parquet] object CatalystConverter {
   // The type internally used for fields
@@ -55,6 +56,14 @@ private[parquet] object CatalystConverter {
     case StructType(fields: Seq[StructField]) => {
       new CatalystStructConverter(fields, fieldIndex, parent)
     }
+    case MapType(keyType: DataType, valueType: DataType) => {
+      new CatalystMapConverter(
+        Seq(
+          new FieldType("key", keyType, false),
+          new FieldType("value", valueType, true)),
+        fieldIndex,
+        parent)
+    }
     case ctype: NativeType => {
       // note: for some reason matching for StringType fails so use this ugly if instead
       if (ctype == StringType) {
@@ -396,6 +405,67 @@ private[parquet] class CatalystStructConverter(
   override def getCurrentRecord: Row = throw new UnsupportedOperationException
 }

-// TODO: add MapConverter
+private[parquet] class CatalystMapConverter(
+    protected[parquet] val schema: Seq[FieldType],
+    override protected[parquet] val index: Int,
+    override protected[parquet] val parent: CatalystConverter)
+  extends GroupConverter with CatalystConverter {
+
+  private val map = new mutable.HashMap[Any, Any]()
+
+  private val keyValueConverter = new GroupConverter with CatalystConverter {
+    private var currentKey: Any = null
+    private var currentValue: Any = null
+    val keyConverter = CatalystConverter.createConverter(schema(0), 0, this)
+    val valueConverter = CatalystConverter.createConverter(schema(1), 1, this)
+
+    override def getConverter(fieldIndex: Int): Converter = if (fieldIndex == 0) keyConverter else valueConverter
+
+    override def end(): Unit = CatalystMapConverter.this.map += currentKey -> currentValue
+
+    override def start(): Unit = {
+      currentKey = null
+      currentValue = null
+    }
+
+    override protected[parquet] val size: Int = 2
+    override protected[parquet] val index: Int = 0
+    override protected[parquet] val parent: CatalystConverter = CatalystMapConverter.this
+
+    override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit = fieldIndex match {
+      case 0 =>
+        currentKey = value
+      case 1 =>
+        currentValue = value
+      case _ =>
+        throw new RuntimeException(s"trying to update Map with fieldIndex $fieldIndex")
+    }
+
+    override protected[parquet] def clearBuffer(): Unit = {}
+    override def getCurrentRecord: Row = throw new UnsupportedOperationException
+  }
+
+  override protected[parquet] val size: Int = 1
+
+  override protected[parquet] def clearBuffer(): Unit = {}
+
+  override def start(): Unit = {
+    map.clear()
+  }
+
+  // TODO: think about reusing the buffer
+  override def end(): Unit = {
+    assert(!isRootConverter)
+    parent.updateField(index, map)
+  }
+
+  override def getConverter(fieldIndex: Int): Converter = keyValueConverter
+
+  override def getCurrentRecord: Row = throw new UnsupportedOperationException
+
+  override protected[parquet] def updateField(fieldIndex: Int, value: Any): Unit =
+    throw new UnsupportedOperationException
+}
+
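The record-assembly flow behind CatalystMapConverter can be hard to see from the diff alone: Parquet calls start() on the outer converter once per map cell, then start()/updateField()/end() on the nested key-value converter once per repeated `map` group, and the outer end() hands the finished map to the parent. Below is a minimal standalone sketch of that callback sequence; the names (SketchMapConverter, startEntry, endEntry) are invented for illustration and are not part of this commit or of Parquet's API.

import scala.collection.mutable

class SketchMapConverter {
  private val map = new mutable.HashMap[Any, Any]()
  private var currentKey: Any = null
  private var currentValue: Any = null

  def start(): Unit = map.clear()                         // a new map cell begins
  def startEntry(): Unit = { currentKey = null; currentValue = null }
  def updateField(fieldIndex: Int, value: Any): Unit =
    if (fieldIndex == 0) currentKey = value else currentValue = value  // 0 = key, 1 = value
  def endEntry(): Unit = map += currentKey -> currentValue
  def end(): Map[Any, Any] = map.toMap                    // handed to the parent converter
}

object SketchMapConverterDemo extends App {
  val c = new SketchMapConverter
  c.start()
  c.startEntry(); c.updateField(0, "key1"); c.updateField(1, 1); c.endEntry()
  c.startEntry(); c.updateField(0, "key2"); c.updateField(1, 2); c.endEntry()
  println(c.end())  // Map(key1 -> 1, key2 -> 2)
}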

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTestData.scala

Lines changed: 54 additions & 0 deletions
@@ -167,9 +167,32 @@ private[sql] object ParquetTestData {
     |}
   """.stripMargin

+  val testNestedSchema4 =
+    """
+      |message TestNested4 {
+        |required int32 x;
+        |optional group data1 {
+          |repeated group map {
+            |required binary key;
+            |optional int32 value;
+          |}
+        |}
+        |required group data2 {
+          |repeated group map {
+            |required int32 key;
+            |optional group value {
+              |required int64 payload1;
+              |optional binary payload2;
+            |}
+          |}
+        |}
+      |}
+    """.stripMargin
+
   val testNestedDir1 = Utils.createTempDir()
   val testNestedDir2 = Utils.createTempDir()
   val testNestedDir3 = Utils.createTempDir()
+  val testNestedDir4 = Utils.createTempDir()

   lazy val testNestedData1 = new ParquetRelation(testNestedDir1.toURI.toString)
   lazy val testNestedData2 = new ParquetRelation(testNestedDir2.toURI.toString)
@@ -327,6 +350,37 @@ private[sql] object ParquetTestData {
     writer.close()
   }

+  def writeNestedFile4() {
+    testNestedDir4.delete()
+    val path: Path = testNestedDir4
+    val schema: MessageType = MessageTypeParser.parseMessageType(testNestedSchema4)
+
+    val r1 = new SimpleGroup(schema)
+    r1.add(0, 7)
+    val map1 = r1.addGroup(1)
+    val keyValue1 = map1.addGroup(0)
+    keyValue1.add(0, "key1")
+    keyValue1.add(1, 1)
+    val keyValue2 = map1.addGroup(0)
+    keyValue2.add(0, "key2")
+    keyValue2.add(1, 2)
+    val map2 = r1.addGroup(2)
+    val keyValue3 = map2.addGroup(0)
+    keyValue3.add(0, 7)
+    val valueGroup1 = keyValue3.addGroup(1)
+    valueGroup1.add(0, 42.toLong)
+    valueGroup1.add(1, "the answer")
+    val keyValue4 = map2.addGroup(0)
+    keyValue4.add(0, 8)
+    val valueGroup2 = keyValue4.addGroup(1)
+    valueGroup2.add(0, 49.toLong)
+
+    val writeSupport = new TestGroupWriteSupport(schema)
+    val writer = new ParquetWriter[Group](path, writeSupport)
+    writer.write(r1)
+    writer.close()
+  }
+
   def readNestedFile(path: File, schemaString: String): Unit = {
     val configuration = new Configuration()
     val fs: FileSystem = path.getFileSystem(configuration)
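For orientation, here is the single record that writeNestedFile4() above is meant to write, spelled out as plain Scala values; this is a hedged reading of the add(...)/addGroup(...) calls, not output captured from the test data.

object NestedFile4ExpectedRecord extends App {
  // x = 7, data1 is a string-to-int map, data2 maps ints to (payload1, payload2) structs;
  // payload2 is never written for key 8, so it surfaces as null.
  val x = 7
  val data1 = Map("key1" -> 1, "key2" -> 2)
  val data2 = Map(7 -> (42L, "the answer"), 8 -> (49L, null))
  println((x, data1, data2))
}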

sql/core/src/main/scala/org/apache/spark/sql/parquet/ParquetTypes.scala

Lines changed: 65 additions & 21 deletions
@@ -63,13 +63,17 @@ private[parquet] object ParquetTypesConverter {
   * Note that we apply the following conversion rules:
   * <ul>
   * <li> Primitive types are converted to the corresponding primitive type.</li>
-  * <li> Group types that have a single field with repetition `REPEATED` or themselves
-  *      have repetition level `REPEATED` are converted to an [[ArrayType]] with the
-  *      corresponding field type (possibly primitive) as element type.</li>
+  * <li> Group types that have a single field that is itself a group, which has repetition
+  *      level `REPEATED` and two fields (named `key` and `value`), are converted to
+  *      a [[MapType]] with the corresponding key and value (value possibly complex)
+  *      as element type.</li>
   * <li> Other group types are converted as follows:<ul>
-  *      <li> If they have a single field, they are converted into a [[StructType]] with
+  *      <li> Group types that have a single field with repetition `REPEATED` or themselves
+  *           have repetition level `REPEATED` are converted to an [[ArrayType]] with the
+  *           corresponding field type (possibly primitive) as element type.</li>
+  *      <li> Other groups with a single field are converted into a [[StructType]] with
   *           the corresponding field type.</li>
-  *      <li> If they have more than one field and repetition level `REPEATED` they are
+  *      <li> If groups have more than one field and repetition level `REPEATED` they are
   *           converted into an [[ArrayType]] with the corresponding [[StructType]] as complex
   *           element type.</li>
   *      <li> Otherwise they are converted into a [[StructType]] with the corresponding
@@ -82,16 +86,33 @@ private[parquet] object ParquetTypesConverter {
   * @return The corresponding Catalyst type.
   */
  def toDataType(parquetType: ParquetType): DataType = {
+    def correspondsToMap(groupType: ParquetGroupType): Boolean = {
+      if (groupType.getFieldCount != 1 || groupType.getFields.apply(0).isPrimitive) {
+        false
+      } else {
+        // This mostly follows the convention in ``parquet.schema.ConversionPatterns``
+        val keyValueGroup = groupType.getFields.apply(0).asGroupType()
+        keyValueGroup.getRepetition == Repetition.REPEATED &&
+          keyValueGroup.getName == "map" &&
+          keyValueGroup.getFields.apply(0).getName == "key" &&
+          keyValueGroup.getFields.apply(1).getName == "value"
+      }
+    }
+    def correspondsToArray(groupType: ParquetGroupType): Boolean = {
+      groupType.getFieldCount == 1 &&
+        (groupType.getFields.apply(0).getRepetition == Repetition.REPEATED ||
+          groupType.getRepetition == Repetition.REPEATED)
+    }
+
    if (parquetType.isPrimitive) {
      toPrimitiveDataType(parquetType.asPrimitiveType.getPrimitiveTypeName)
-    }
-    else {
+    } else {
      val groupType = parquetType.asGroupType()
      parquetType.getOriginalType match {
        // if the schema was constructed programmatically there may be hints how to convert
        // it inside the metadata via the OriginalType field
        case ParquetOriginalType.LIST => { // TODO: check enums!
-          val fields = groupType.getFields.map {
+          val fields = groupType.getFields.map {
            field => new StructField(
              field.getName,
              toDataType(field),
@@ -103,16 +124,29 @@ private[parquet] object ParquetTypesConverter {
            new ArrayType(StructType(fields))
          }
        }
+        case ParquetOriginalType.MAP => {
+          assert(
+            !groupType.getFields.apply(0).isPrimitive,
+            "Parquet Map type malformatted: expected nested group for map!")
+          val keyValueGroup = groupType.getFields.apply(0).asGroupType()
+          assert(
+            keyValueGroup.getFieldCount == 2,
+            "Parquet Map type malformatted: nested group should have 2 (key, value) fields!")
+          val keyType = toDataType(keyValueGroup.getFields.apply(0))
+          val valueType = toDataType(keyValueGroup.getFields.apply(1))
+          new MapType(keyType, valueType)
+        }
        case _ => {
-          // everything else nested becomes a Struct, unless it has a single repeated field
-          // in which case it becomes an array (this should correspond to the inverse operation of
-          // parquet.schema.ConversionPatterns.listType)
-          if (groupType.getFieldCount == 1 &&
-            (groupType.getFields.apply(0).getRepetition == Repetition.REPEATED ||
-              groupType.getRepetition == Repetition.REPEATED)) {
+          // Note: the order of these checks is important!
+          if (correspondsToMap(groupType)) { // MapType
+            val keyValueGroup = groupType.getFields.apply(0).asGroupType()
+            val keyType = toDataType(keyValueGroup.getFields.apply(0))
+            val valueType = toDataType(keyValueGroup.getFields.apply(1))
+            new MapType(keyType, valueType)
+          } else if (correspondsToArray(groupType)) { // ArrayType
            val elementType = toDataType(groupType.getFields.apply(0))
            new ArrayType(elementType)
-          } else {
+          } else { // everything else: StructType
            val fields = groupType
              .getFields
              .map(ptype => new StructField(
@@ -164,7 +198,10 @@ private[parquet] object ParquetTypesConverter {
   * <ul>
   * <li> Primitive types are converted into Parquet's primitive types.</li>
   * <li> [[org.apache.spark.sql.catalyst.types.StructType]]s are converted
-  *      into Parquet's `GroupType` with the corresponding field types.</li>
+  *      into Parquet's `GroupType` with the corresponding field types.</li>
+  * <li> [[org.apache.spark.sql.catalyst.types.MapType]]s are converted
+  *      into a nested (2-level) Parquet `GroupType` with two fields: a key type and
+  *      a value type. The nested group has repetition level `REPEATED`.</li>
   * <li> [[org.apache.spark.sql.catalyst.types.ArrayType]]s are handled as follows:<ul>
   * <li> If their element is complex, that is of type
   *      [[org.apache.spark.sql.catalyst.types.StructType]], they are converted
@@ -174,18 +211,18 @@ private[parquet] object ParquetTypesConverter {
   *      that is also a list but has only a single field of the type corresponding to
   *      the element type.</li></ul></li>
   * </ul>
-  * Parquet's repetition level is set according to the following rule:
+  * Parquet's repetition level is generally set according to the following rule:
   * <ul>
-  * <li> If the call to `fromDataType` is recursive inside an enclosing `ArrayType`, then
-  *      the repetition level is set to `REPEATED`.</li>
+  * <li> If the call to `fromDataType` is recursive inside an enclosing `ArrayType` or
+  *      `MapType`, then the repetition level is set to `REPEATED`.</li>
   * <li> Otherwise, if the attribute whose type is converted is `nullable`, the Parquet
   *      type gets repetition level `OPTIONAL` and otherwise `REQUIRED`.</li>
   * </ul>
-  * The single expection to this rule is an [[org.apache.spark.sql.catalyst.types.ArrayType]]
+  * The single exception to this rule is an [[org.apache.spark.sql.catalyst.types.ArrayType]]
   * that contains a [[org.apache.spark.sql.catalyst.types.StructType]], whose repetition level
   * is always set to `REPEATED`.
   *
-  @param ctype The type to convert.
+  * @param ctype The type to convert.
   * @param name The name of the [[org.apache.spark.sql.catalyst.expressions.Attribute]]
   *             whose type is converted
   * @param nullable When true indicates that the attribute is nullable
@@ -239,6 +276,13 @@ private[parquet] object ParquetTypesConverter {
          }
          new ParquetGroupType(repetition, name, fields)
        }
+        case MapType(keyType, valueType) => {
+          ConversionPatterns.mapType(
+            repetition,
+            name,
+            fromDataType(keyType, "key", false, inArray = false),
+            fromDataType(valueType, "value", true, inArray = false))
+        }
        case _ => sys.error(s"Unsupported datatype $ctype")
      }
    }
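To make the new rules concrete, a hedged round-trip sketch follows: it converts a Catalyst MapType to a Parquet group with fromDataType and back with toDataType, as added in this file. The driver object, its placement in the org.apache.spark.sql.parquet package (needed because ParquetTypesConverter is private[parquet]), and the explicit nullable/inArray arguments are assumptions based on the calls visible in this patch, not a verified signature; the expected Parquet layout is the ConversionPatterns.mapType shape, a repeated inner group named `map` with `key`/`value` fields.

package org.apache.spark.sql.parquet

import org.apache.spark.sql.catalyst.types._

// Hypothetical driver, not part of the commit.
object MapTypeRoundTripSketch extends App {
  val catalystType = MapType(StringType, IntegerType)
  // Catalyst -> Parquet: an optional group "data1" containing a repeated "map" group
  // with a required binary "key" and an optional int32 "value".
  val parquetType = ParquetTypesConverter.fromDataType(catalystType, "data1", true, inArray = false)
  println(parquetType)
  // Parquet -> Catalyst: should recover MapType(StringType, IntegerType).
  println(ParquetTypesConverter.toDataType(parquetType))
}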

sql/core/src/test/scala/org/apache/spark/sql/parquet/ParquetQuerySuite.scala

Lines changed: 46 additions & 1 deletion
@@ -74,6 +74,8 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
    ParquetTestData.writeFilterFile()
    ParquetTestData.writeNestedFile1()
    ParquetTestData.writeNestedFile2()
+    ParquetTestData.writeNestedFile3()
+    ParquetTestData.writeNestedFile4()
    testRDD = parquetFile(ParquetTestData.testDir.toString)
    testRDD.registerAsTable("testsource")
    parquetFile(ParquetTestData.testFilterDir.toString)
@@ -85,6 +87,8 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
    Utils.deleteRecursively(ParquetTestData.testFilterDir)
    Utils.deleteRecursively(ParquetTestData.testNestedDir1)
    Utils.deleteRecursively(ParquetTestData.testNestedDir2)
+    Utils.deleteRecursively(ParquetTestData.testNestedDir3)
+    Utils.deleteRecursively(ParquetTestData.testNestedDir4)
    // here we should also unregister the table??
  }
@@ -495,7 +499,6 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA

  test("nested structs") {
    implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
-    ParquetTestData.writeNestedFile3()
    val data = TestSQLContext
      .parquetFile(ParquetTestData.testNestedDir3.toString)
      .toSchemaRDD
@@ -514,6 +517,48 @@ class ParquetQuerySuite extends QueryTest with FunSuiteLike with BeforeAndAfterA
    assert(result3(0)(0) === false)
  }

+  test("simple map") {
+    implicit def anyToMap(value: Any) = value.asInstanceOf[collection.mutable.HashMap[String, Int]]
+    val data = TestSQLContext
+      .parquetFile(ParquetTestData.testNestedDir4.toString)
+      .toSchemaRDD
+    data.registerAsTable("mapTable")
+    val result1 = sql("SELECT data1 FROM mapTable").collect()
+    assert(result1.size === 1)
+    assert(result1(0)(0).toMap.getOrElse("key1", 0) === 1)
+    assert(result1(0)(0).toMap.getOrElse("key2", 0) === 2)
+  }
+
+  test("map with struct values") {
+    //implicit def anyToRow(value: Any): Row = value.asInstanceOf[Row]
+    implicit def anyToMap(value: Any) = value.asInstanceOf[collection.mutable.HashMap[Int, Row]]
+    //val data = TestSQLContext
+    //  .parquetFile(ParquetTestData.testNestedDir4.toString)
+    //  .toSchemaRDD
+    val data = TestSQLContext
+      .parquetFile(ParquetTestData.testNestedDir4.toString)
+      .toSchemaRDD
+    data.registerAsTable("mapTable")
+
+    /*ParquetTestData.readNestedFile(
+      ParquetTestData.testNestedDir4,
+      ParquetTestData.testNestedSchema4)
+    val result = TestSQLContext
+      .parquetFile(ParquetTestData.testNestedDir4.toString)
+      .toSchemaRDD
+      .collect()*/
+    val result1 = sql("SELECT data2 FROM mapTable").collect()
+    assert(result1.size === 1)
+    val entry1 = result1(0)(0).getOrElse(7, null)
+    assert(entry1 != null)
+    assert(entry1(0) === 42)
+    assert(entry1(1) === "the answer")
+    val entry2 = result1(0)(0).getOrElse(8, null)
+    assert(entry2 != null)
+    assert(entry2(0) === 49)
+    assert(entry2(1) === null)
+  }
+
  /**
   * Creates an empty SchemaRDD backed by a ParquetRelation.
   *
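A note on the test idiom above: SchemaRDD rows surface complex columns as Any, so both map tests install an implicit conversion that casts the cell to the expected runtime type before calling map methods on it. Below is a minimal standalone analogue with invented names, assuming the map cell really holds the mutable.HashMap produced by CatalystMapConverter.

import scala.collection.mutable

object ImplicitCastSketch extends App {
  // Invented helper mirroring the tests' anyToMap trick.
  implicit def anyToStringIntMap(value: Any): mutable.HashMap[String, Int] =
    value.asInstanceOf[mutable.HashMap[String, Int]]

  // Stand-in for a row cell of a map column (a Row field is typed Any).
  val cell: Any = mutable.HashMap[String, Int]("key1" -> 1, "key2" -> 2)
  // The implicit view lets map methods be called directly on the Any value.
  println(cell.getOrElse("key1", 0))  // 1
}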
