
Commit afbb6d9

Author: XuQianJin-Stars (committed)

Support structured streaming read for Iceberg

1 parent 56940b0
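
For orientation, with this commit applied, a structured streaming read of an Iceberg table from Spark 2.4 would be wired up roughly as follows. This is a minimal sketch, assuming the "iceberg" DataSourceV2 format that this module registers; the table path and checkpoint location are placeholders, not values taken from the commit.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.streaming.StreamingQuery;

public class StreamingReadExample {
  public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession.builder()
        .appName("iceberg-streaming-read")
        .master("local[2]")
        .getOrCreate();

    // Read the Iceberg table as an unbounded stream of appended rows.
    Dataset<Row> stream = spark.readStream()
        .format("iceberg")
        .load("/tmp/warehouse/db/table");  // placeholder path

    // Echo each micro-batch to the console.
    StreamingQuery query = stream.writeStream()
        .format("console")
        .option("checkpointLocation", "/tmp/checkpoints/iceberg-read")  // placeholder
        .start();

    query.awaitTermination();
  }
}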

File tree

4 files changed: +26 −19 lines


core/src/main/java/org/apache/iceberg/MicroBatches.java

+1 −1

@@ -153,7 +153,7 @@ private static List<Pair<ManifestFile, Integer>> indexManifests(List<ManifestFil
    * startFileIndex.
    */
   private static List<Pair<ManifestFile, Integer>> skipManifests(List<Pair<ManifestFile, Integer>> indexedManifests,
-      long startFileIndex) {
+                                                                 long startFileIndex) {
     if (startFileIndex == 0) {
       return indexedManifests;
     }

core/src/main/java/org/apache/iceberg/util/SnapshotUtil.java

+1 −14

@@ -20,7 +20,6 @@
 package org.apache.iceberg.util;
 
 import java.util.List;
-import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.function.Function;
 import org.apache.iceberg.DataFile;
 import org.apache.iceberg.Snapshot;
@@ -70,20 +69,8 @@ public static List<Long> currentAncestors(Table table) {
    * This method assumes that fromSnapshotId is an ancestor of toSnapshotId.
    */
   public static List<Long> snapshotIdsBetween(Table table, long fromSnapshotId, long toSnapshotId) {
-    AtomicBoolean isAncestor = new AtomicBoolean(false);
     List<Long> snapshotIds = Lists.newArrayList(ancestorIds(table.snapshot(toSnapshotId),
-        snapshotId -> {
-          if (snapshotId == fromSnapshotId) {
-            isAncestor.set(true);
-            return null;
-          } else {
-            return table.snapshot(snapshotId);
-          }
-        }));
-    if (!isAncestor.get()) {
-      throw new IllegalStateException(fromSnapshotId + " is not an ancestor of " + toSnapshotId);
-    }
-
+        snapshotId -> snapshotId != fromSnapshotId ? table.snapshot(snapshotId) : null));
     return snapshotIds;
   }
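
The simplified lambda works because ancestorIds stops walking the snapshot lineage as soon as the lookup function returns null, so returning null at fromSnapshotId excludes that snapshot and everything older. Below is a minimal sketch of that assumed walk contract; SnapshotLike and AncestorWalk are illustrative stand-ins, not Iceberg types.

import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;

// Illustrative stand-in for org.apache.iceberg.Snapshot: only the two
// accessors the walk needs.
interface SnapshotLike {
  long snapshotId();
  Long parentId();  // null when this snapshot has no parent
}

class AncestorWalk {
  // Sketch of the ancestorIds contract assumed by snapshotIdsBetween:
  // follow parent pointers, asking `lookup` for each parent; a null
  // result ends the walk. Returning null when snapshotId == fromSnapshotId
  // therefore drops fromSnapshotId and everything older.
  static List<Long> ancestorIds(SnapshotLike start, Function<Long, SnapshotLike> lookup) {
    List<Long> ids = new ArrayList<>();
    SnapshotLike current = start;
    while (current != null) {
      ids.add(current.snapshotId());
      Long parent = current.parentId();
      current = (parent == null) ? null : lookup.apply(parent);
    }
    return ids;
  }
}

Note the trade-off visible in the diff: the removed AtomicBoolean used to verify at runtime that fromSnapshotId really is an ancestor, while the new code relies on the documented assumption in the Javadoc above.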

spark2/src/main/java/org/apache/iceberg/spark/source/StreamingReader.java

+24 −3

@@ -167,7 +167,7 @@ public void stop() {
 
   @Override
   public boolean enableBatchRead() {
-    return readUsingBatch == null ? false : readUsingBatch;
+    return readUsingBatch != null && readUsingBatch;
   }
 
   @Override
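
Note on the enableBatchRead change: with a boxed Boolean, readUsingBatch != null && readUsingBatch null-checks and unboxes in one expression, equivalent to the removed ternary but simpler.
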
@@ -215,6 +215,12 @@ protected List<CombinedScanTask> tasks() {
     return combinedScanTasks;
   }
 
+  /**
+   * Calculates the start offset. If startSnapshotId is set, construction starts from that
+   * snapshot; otherwise it starts from the beginning of the table.
+   *
+   * @return The start offset to scan from.
+   */
   private StreamingOffset calculateStartingOffset() {
     StreamingOffset startingOffset;
     if (startSnapshotId != null) {
@@ -232,6 +238,12 @@ private StreamingOffset calculateStartingOffset() {
     return startingOffset;
   }
 
+  /**
+   * Calculates the end offset for a batch beginning at the given start offset.
+   *
+   * @param start The start offset to scan from
+   * @return The end offset to scan to
+   */
   private StreamingOffset calculateEndOffset(StreamingOffset start) {
     if (start.equals(StreamingOffset.START_OFFSET)) {
       return StreamingOffset.START_OFFSET;
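
Both documented helpers feed Spark 2.4's MicroBatchReader lifecycle, where the engine passes the source an optional offset range before planning each batch. A compressed sketch of that interaction follows, with String standing in for Spark's Offset type and both calculate methods stubbed; only the Optional fallback pattern is the point here.

import java.util.Optional;

class OffsetRangeSketch {
  private String startOffset;
  private String endOffset;

  // Mirrors MicroBatchReader.setOffsetRange(Optional, Optional): when Spark
  // has no checkpointed start (first batch), the source derives one; when no
  // end is given, the source decides how far this batch may read.
  void setOffsetRange(Optional<String> start, Optional<String> end) {
    this.startOffset = start.orElseGet(this::calculateStartingOffset);
    this.endOffset = end.orElseGet(() -> calculateEndOffset(this.startOffset));
  }

  private String calculateStartingOffset() {
    return "START_OFFSET";  // stub: the real method picks a snapshot to begin from
  }

  private String calculateEndOffset(String start) {
    return start;  // stub: the real method scans forward under the rate limit
  }
}
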
@@ -255,6 +267,13 @@ private StreamingOffset calculateEndOffset(StreamingOffset start) {
     }
   }
 
+  /**
+   * Rate-limits the streaming read by startOffset and a maxSize byte cap on the returned micro-batches.
+   *
+   * @param startOffset The start offset to scan from
+   * @param maxSize The maximum total bytes covered by the returned micro-batches
+   * @return A list of MicroBatch
+   */
   @VisibleForTesting
   @SuppressWarnings("checkstyle:HiddenField")
   List<MicroBatch> getChangesWithRateLimit(StreamingOffset startOffset, long maxSize) {
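
To make the maxSize contract concrete, here is a sketch of byte-capped batch selection. Batch is a simplified stand-in for Iceberg's MicroBatch, and always taking at least one batch is this sketch's choice so the stream can progress; it is not necessarily the commit's exact policy.

import java.util.ArrayList;
import java.util.List;

class RateLimitSketch {
  // Simplified stand-in for an Iceberg MicroBatch: just a size in bytes.
  static class Batch {
    final long sizeInBytes;
    Batch(long sizeInBytes) { this.sizeInBytes = sizeInBytes; }
  }

  // Keep taking pending batches until adding the next one would exceed
  // maxSize; at least one batch is always returned so the stream advances.
  static List<Batch> takeWithRateLimit(List<Batch> pending, long maxSize) {
    List<Batch> taken = new ArrayList<>();
    long total = 0;
    for (Batch batch : pending) {
      if (!taken.isEmpty() && total + batch.sizeInBytes > maxSize) {
        break;
      }
      taken.add(batch);
      total += batch.sizeInBytes;
    }
    return taken;
  }
}
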
@@ -335,9 +354,11 @@ private boolean shouldGenerateFromStartOffset(StreamingOffset startOffset) {
   }
 
   private static void assertNoOverwrite(Snapshot snapshot) {
-    if (snapshot.operation().equals(DataOperations.OVERWRITE)) {
+    if (snapshot.operation().equals(DataOperations.OVERWRITE) ||
+        snapshot.operation().equals(DataOperations.REPLACE) ||
+        snapshot.operation().equals(DataOperations.DELETE)) {
       throw new UnsupportedOperationException(String.format("Found %s operation, cannot support incremental data for " +
-          "snapshot %d", DataOperations.OVERWRITE, snapshot.snapshotId()));
+          "snapshot %d", snapshot.operation(), snapshot.snapshotId()));
     }
   }
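
The widened guard now rejects any snapshot whose operation rewrites or removes data, not just overwrites. The same check could be phrased against a set of operation names; the sketch below uses the string values "overwrite", "replace", and "delete" held by Iceberg's DataOperations constants, but it is an alternative phrasing, not what the commit does.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

class OperationCheckSketch {
  // Sketch: these names mirror Iceberg's DataOperations constants.
  private static final Set<String> UNSUPPORTED_OPS =
      new HashSet<>(Arrays.asList("overwrite", "replace", "delete"));

  static void assertIncrementalReadable(String operation, long snapshotId) {
    if (UNSUPPORTED_OPS.contains(operation)) {
      throw new UnsupportedOperationException(String.format(
          "Found %s operation, cannot support incremental data for snapshot %d",
          operation, snapshotId));
    }
  }
}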

spark2/src/test/java/org/apache/iceberg/spark/source/TestStructuredStreamingRead.java

−1

@@ -36,7 +36,6 @@
 import org.apache.iceberg.Table;
 import org.apache.iceberg.hadoop.HadoopTables;
 import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
-import org.apache.iceberg.relocated.com.google.common.collect.Iterables;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.iceberg.types.Types;
 import org.apache.iceberg.util.SnapshotUtil;
