Skip to content

Commit d2bc1da

Browse files
committed
Add CTAS tests.
1 parent a2dafe1 commit d2bc1da

File tree

6 files changed

+204
-6
lines changed

6 files changed

+204
-6
lines changed

core/src/main/java/org/apache/iceberg/BaseMetastoreCatalog.java

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,17 @@ public Transaction newReplaceTableTransaction(
110110
throw new NoSuchTableException("No such table: " + identifier);
111111
}
112112

113-
String baseLocation = location != null ? location : defaultWarehouseLocation(identifier);
114-
Map<String, String> tableProperties = properties != null ? properties : Maps.newHashMap();
115-
TableMetadata metadata = TableMetadata.newTableMetadata(schema, spec, baseLocation, tableProperties);
113+
TableMetadata metadata;
114+
if (ops.current() != null) {
115+
String baseLocation = location != null ? location : ops.current().location();
116+
Map<String, String> tableProperties = properties != null ? properties : Maps.newHashMap();
117+
metadata = ops.current().buildReplacement(schema, spec, baseLocation, tableProperties);
118+
} else {
119+
String baseLocation = location != null ? location : defaultWarehouseLocation(identifier);
120+
Map<String, String> tableProperties = properties != null ? properties : Maps.newHashMap();
121+
metadata = TableMetadata.newTableMetadata(schema, spec, baseLocation, tableProperties);
122+
}
123+
116124
if (orCreate) {
117125
return Transactions.createOrReplaceTableTransaction(identifier.toString(), ops, metadata);
118126
} else {

core/src/main/java/org/apache/iceberg/TableMetadata.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ public TableMetadata removeSnapshotLogEntries(Set<Long> snapshotIds) {
566566

567567
// The caller is responsible for passing an updatedPartitionSpec with correct partition field IDs
568568
public TableMetadata buildReplacement(Schema updatedSchema, PartitionSpec updatedPartitionSpec,
569-
Map<String, String> updatedProperties) {
569+
String newLocation, Map<String, String> updatedProperties) {
570570
ValidationException.check(formatVersion > 1 || PartitionSpec.hasSequentialIds(updatedPartitionSpec),
571571
"Spec does not use sequential IDs that are required in v1: %s", updatedPartitionSpec);
572572

@@ -602,7 +602,7 @@ public TableMetadata buildReplacement(Schema updatedSchema, PartitionSpec update
602602
newProperties.putAll(this.properties);
603603
newProperties.putAll(updatedProperties);
604604

605-
return new TableMetadata(null, formatVersion, uuid, location,
605+
return new TableMetadata(null, formatVersion, uuid, newLocation,
606606
lastSequenceNumber, System.currentTimeMillis(), nextLastColumnId.get(), freshSchema,
607607
specId, builder.build(), ImmutableMap.copyOf(newProperties),
608608
-1, snapshots, ImmutableList.of(), addPreviousFile(file, lastUpdatedMillis, newProperties));

core/src/test/java/org/apache/iceberg/TestTables.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ public static Transaction beginReplace(File temp, String name, Schema schema, Pa
7878

7979
TableMetadata metadata;
8080
if (current != null) {
81-
metadata = current.buildReplacement(schema, spec, properties);
81+
metadata = current.buildReplacement(schema, spec, current.location(), properties);
8282
return Transactions.replaceTableTransaction(name, ops, metadata);
8383
} else {
8484
metadata = newTableMetadata(schema, spec, temp.toString(), properties);

spark/src/test/java/org/apache/iceberg/spark/SparkTestBase.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ protected void assertEquals(String context, List<Object[]> expectedRows, List<Ob
101101
for (int row = 0; row < expectedRows.size(); row += 1) {
102102
Object[] expected = expectedRows.get(row);
103103
Object[] actual = actualRows.get(row);
104+
Assert.assertEquals("Number of columns should match", expected.length, actual.length);
104105
for (int col = 0; col < actualRows.get(row).length; col += 1) {
105106
if (expected[col] != ANY) {
106107
Assert.assertEquals(context + ": row " + row + " col " + col + " contents should match",

spark3/src/test/java/org/apache/iceberg/spark/SparkCatalogTestBase.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,4 +97,8 @@ public SparkCatalogTestBase(String catalogName, String implementation, Map<Strin
9797

9898
this.tableName = (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default.table";
9999
}
100+
101+
protected String tableName(String name) {
102+
return (catalogName.equals("spark_catalog") ? "" : catalogName + ".") + "default." + name;
103+
}
100104
}
Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.iceberg.spark.sql;
21+
22+
import com.google.common.collect.Iterables;
23+
import java.util.Map;
24+
import org.apache.iceberg.PartitionSpec;
25+
import org.apache.iceberg.Schema;
26+
import org.apache.iceberg.Table;
27+
import org.apache.iceberg.spark.SparkCatalogTestBase;
28+
import org.apache.iceberg.types.Types;
29+
import org.junit.After;
30+
import org.junit.Assert;
31+
import org.junit.Test;
32+
33+
public class TestCreateTableAsSelect extends SparkCatalogTestBase {
34+
35+
private final String sourceName;
36+
37+
public TestCreateTableAsSelect(String catalogName, String implementation, Map<String, String> config) {
38+
super(catalogName, implementation, config);
39+
this.sourceName = tableName("source");
40+
41+
sql("CREATE TABLE IF NOT EXISTS %s (id bigint NOT NULL, data string) " +
42+
"USING iceberg PARTITIONED BY (truncate(id, 3))", sourceName);
43+
sql("INSERT INTO %s VALUES (1, 'a'), (2, 'b'), (3, 'c')", sourceName);
44+
}
45+
46+
@After
47+
public void removeTables() {
48+
sql("DROP TABLE IF EXISTS %s", tableName);
49+
}
50+
51+
@Test
52+
public void testUnpartitionedCTAS() {
53+
sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", tableName, sourceName);
54+
55+
Schema expectedSchema = new Schema(
56+
Types.NestedField.optional(1, "id", Types.LongType.get()),
57+
Types.NestedField.optional(2, "data", Types.StringType.get())
58+
);
59+
60+
Table ctasTable = validationCatalog.loadTable(tableIdent);
61+
62+
Assert.assertEquals("Should have expected nullable schema",
63+
expectedSchema.asStruct(), ctasTable.schema().asStruct());
64+
Assert.assertEquals("Should be an unpartitioned table",
65+
0, ctasTable.spec().fields().size());
66+
assertEquals("Should have rows matching the source table",
67+
sql("SELECT * FROM %s ORDER BY id", sourceName),
68+
sql("SELECT * FROM %s ORDER BY id", tableName));
69+
}
70+
71+
@Test
72+
public void testPartitionedCTAS() {
73+
sql("CREATE TABLE %s USING iceberg PARTITIONED BY (id) AS SELECT * FROM %s ORDER BY id", tableName, sourceName);
74+
75+
Schema expectedSchema = new Schema(
76+
Types.NestedField.optional(1, "id", Types.LongType.get()),
77+
Types.NestedField.optional(2, "data", Types.StringType.get())
78+
);
79+
80+
PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema)
81+
.identity("id")
82+
.build();
83+
84+
Table ctasTable = validationCatalog.loadTable(tableIdent);
85+
86+
Assert.assertEquals("Should have expected nullable schema",
87+
expectedSchema.asStruct(), ctasTable.schema().asStruct());
88+
Assert.assertEquals("Should be partitioned by id",
89+
expectedSpec, ctasTable.spec());
90+
assertEquals("Should have rows matching the source table",
91+
sql("SELECT * FROM %s ORDER BY id", sourceName),
92+
sql("SELECT * FROM %s ORDER BY id", tableName));
93+
}
94+
95+
@Test
96+
public void testRTAS() {
97+
sql("CREATE TABLE %s USING iceberg AS SELECT * FROM %s", tableName, sourceName);
98+
99+
assertEquals("Should have rows matching the source table",
100+
sql("SELECT * FROM %s ORDER BY id", sourceName),
101+
sql("SELECT * FROM %s ORDER BY id", tableName));
102+
103+
sql("REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " +
104+
"SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " +
105+
"FROM %s ORDER BY 3, 1", tableName, sourceName);
106+
107+
// spark_catalog does not use an atomic replace, so the table history and old spec is dropped
108+
// the other catalogs do use atomic replace, so the spec id is incremented
109+
boolean isAtomic = !"spark_catalog".equals(catalogName);
110+
111+
Schema expectedSchema = new Schema(
112+
Types.NestedField.optional(1, "id", Types.LongType.get()),
113+
Types.NestedField.optional(2, "data", Types.StringType.get()),
114+
Types.NestedField.optional(3, "part", Types.StringType.get())
115+
);
116+
117+
int specId = isAtomic ? 1 : 0;
118+
PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema)
119+
.identity("part")
120+
.withSpecId(specId)
121+
.build();
122+
123+
Table rtasTable = validationCatalog.loadTable(tableIdent);
124+
125+
// the replacement table has a different schema and partition spec than the original
126+
Assert.assertEquals("Should have expected nullable schema",
127+
expectedSchema.asStruct(), rtasTable.schema().asStruct());
128+
Assert.assertEquals("Should be partitioned by part",
129+
expectedSpec, rtasTable.spec());
130+
131+
assertEquals("Should have rows matching the source table",
132+
sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " +
133+
"FROM %s ORDER BY id", sourceName),
134+
sql("SELECT * FROM %s ORDER BY id", tableName));
135+
136+
Assert.assertEquals("Table should have expected snapshots",
137+
isAtomic ? 2 : 1, Iterables.size(rtasTable.snapshots()));
138+
}
139+
140+
@Test
141+
public void testCreateRTAS() {
142+
sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " +
143+
"SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " +
144+
"FROM %s ORDER BY 3, 1", tableName, sourceName);
145+
146+
assertEquals("Should have rows matching the source table",
147+
sql("SELECT id, data, CASE WHEN (id %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " +
148+
"FROM %s ORDER BY id", sourceName),
149+
sql("SELECT * FROM %s ORDER BY id", tableName));
150+
151+
sql("CREATE OR REPLACE TABLE %s USING iceberg PARTITIONED BY (part) AS " +
152+
"SELECT 2 * id as id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " +
153+
"FROM %s ORDER BY 3, 1", tableName, sourceName);
154+
155+
// spark_catalog does not use an atomic replace, so the table history is dropped
156+
boolean isAtomic = !"spark_catalog".equals(catalogName);
157+
158+
Schema expectedSchema = new Schema(
159+
Types.NestedField.optional(1, "id", Types.LongType.get()),
160+
Types.NestedField.optional(2, "data", Types.StringType.get()),
161+
Types.NestedField.optional(3, "part", Types.StringType.get())
162+
);
163+
164+
PartitionSpec expectedSpec = PartitionSpec.builderFor(expectedSchema)
165+
.identity("part")
166+
.withSpecId(0) // the spec is identical and should be reused
167+
.build();
168+
169+
Table rtasTable = validationCatalog.loadTable(tableIdent);
170+
171+
// the replacement table has a different schema and partition spec than the original
172+
Assert.assertEquals("Should have expected nullable schema",
173+
expectedSchema.asStruct(), rtasTable.schema().asStruct());
174+
Assert.assertEquals("Should be partitioned by part",
175+
expectedSpec, rtasTable.spec());
176+
177+
assertEquals("Should have rows matching the source table",
178+
sql("SELECT 2 * id, data, CASE WHEN ((2 * id) %% 2) = 0 THEN 'even' ELSE 'odd' END AS part " +
179+
"FROM %s ORDER BY id", sourceName),
180+
sql("SELECT * FROM %s ORDER BY id", tableName));
181+
182+
Assert.assertEquals("Table should have expected snapshots",
183+
isAtomic ? 2 : 1, Iterables.size(rtasTable.snapshots()));
184+
}
185+
}

0 commit comments

Comments
 (0)