API: Add IncrementalChangelogScan

apache · rdblue · Jul 3, 2022 · May 25, 2022 · Jul 1, 2022 · Jun 28, 2022
commit e8b11009b5f80deff0481eda76098ad25d1aac40
diff --git a/api/src/main/java/org/apache/iceberg/AddedRowsScanTask.java b/api/src/main/java/org/apache/iceberg/AddedRowsScanTask.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.util.List;
+
+/**
+ * A scan task for inserted data records generated by adding a data file to the table.
+ * <p>
+ * Added files may have matching delete files. If so, they are included in this task via
+ * {@link #deletes()} and must be applied while reading the data file.
+ */
+public interface AddedRowsScanTask extends ChangelogScanTask, ContentScanTask<DataFile> {
+  /**
+   * Returns the {@link DeleteFile delete files} to apply when reading the task's data file.
+   *
+   * @return a list of delete files to apply
+   */
+  List<DeleteFile> deletes();
+
+  @Override
+  default ChangelogOperation operation() {
+    return ChangelogOperation.INSERT; // this task type represents inserted records
+  }
+}
diff --git a/api/src/main/java/org/apache/iceberg/ChangelogOperation.java b/api/src/main/java/org/apache/iceberg/ChangelogOperation.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+/**
+ * An enum representing the possible operations in a changelog: a record is inserted or deleted.
+ */
+public enum ChangelogOperation {
+  INSERT, DELETE
+}
diff --git a/api/src/main/java/org/apache/iceberg/ChangelogScanTask.java b/api/src/main/java/org/apache/iceberg/ChangelogScanTask.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+/**
+ * A scan task that produces changelog records, i.e. inserts or deletes committed in a snapshot.
+ */
+public interface ChangelogScanTask extends ScanTask {
+  /**
+   * Returns the {@link ChangelogOperation operation} type of records produced by this task (insert or delete).
+   */
+  ChangelogOperation operation();
+
+  /**
+   * Returns the relative change order in which the changes of this task must be applied.
+   */
+  int changeOrder();
+
+  /**
+   * Returns the ID of the snapshot in which the changes were committed.
+   */
+  long commitSnapshotId();
+}
diff --git a/api/src/main/java/org/apache/iceberg/DeletedDataFileScanTask.java b/api/src/main/java/org/apache/iceberg/DeletedDataFileScanTask.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.util.List;
+
+/**
+ * A scan task for deleted data records generated by removing a data file from the table.
+ * <p>
+ * All delete files added before the data file was removed must be applied while reading it, so that
+ * the task outputs only those data records that were live when the data file was removed.
+ */
+public interface DeletedDataFileScanTask extends ChangelogScanTask, ContentScanTask<DataFile> {
+  /**
+   * Returns the {@link DeleteFile delete files} to apply when reading the task's data file.
+   *
+   * @return a list of delete files to apply
+   */
+  List<DeleteFile> deletes();
+
+  @Override
+  default ChangelogOperation operation() {
+    return ChangelogOperation.DELETE; // this task type represents deleted records
+  }
+}
diff --git a/api/src/main/java/org/apache/iceberg/DeletedRowsScanTask.java b/api/src/main/java/org/apache/iceberg/DeletedRowsScanTask.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+import java.util.List;
+
+/**
+ * A scan task for deleted data records generated by adding delete files that apply to the task's data file.
+ */
+public interface DeletedRowsScanTask extends ChangelogScanTask, ContentScanTask<DataFile> {
+  /**
+   * Returns the added {@link DeleteFile delete files} that apply to the task's data file.
+   * Records removed by these delete files should appear as deletes in the changelog.
+   *
+   * @return a list of added delete files
+   */
+  List<DeleteFile> addedDeletes();
+
+  /**
+   * Returns the {@link DeleteFile delete files} that already existed and must be applied before
+   * determining which records are deleted by the delete files in {@link #addedDeletes()}.
+   * Records removed by these existing delete files should not appear in the changelog.
+   *
+   * @return a list of existing delete files
+   */
+  List<DeleteFile> existingDeletes();
+
+  @Override
+  default ChangelogOperation operation() {
+    return ChangelogOperation.DELETE; // this task type represents deleted records
+  }
+}
diff --git a/api/src/main/java/org/apache/iceberg/IncrementalAppendScan.java b/api/src/main/java/org/apache/iceberg/IncrementalAppendScan.java
@@ -23,41 +23,5 @@
 /**
  * API for configuring an incremental table scan for appends only snapshots
  */
-public interface IncrementalAppendScan extends Scan<IncrementalAppendScan, FileScanTask, CombinedScanTask> {
-
-  /**
-   * Refine the incremental scan with the start snapshot inclusive.
-   * <p>
-   * If neither {@link #fromSnapshotInclusive(long)} or {@link #fromSnapshotExclusive(long)} is provided,
-   * start snapshot inclusive is defaulted to the oldest ancestor of the end snapshot.
-   *
-   * @param fromSnapshotId the start snapshot id inclusive
-   * @return an incremental table scan from {@code fromSnapshotId} inclusive
-   * @throws IllegalArgumentException if the start snapshot is not an ancestor
-   * of the end snapshot
-   */
-  IncrementalAppendScan fromSnapshotInclusive(long fromSnapshotId);
-
-  /**
-   * Refine the incremental scan with the start snapshot exclusive.
-   * <p>
-   * If neither {@link #fromSnapshotInclusive(long)} or {@link #fromSnapshotExclusive(long)} is provided,
-   * start snapshot inclusive is defaulted to the oldest ancestor of the end snapshot.
-   *
-   * @param fromSnapshotId the start snapshot id (exclusive)
-   * @return an incremental table scan from {@code fromSnapshotId} exclusive
-   * @throws IllegalArgumentException if the start snapshot is not an ancestor
-   * of the end snapshot
-   */
-  IncrementalAppendScan fromSnapshotExclusive(long fromSnapshotId);
-
-  /**
-   * Refine the incremental scan with the end snapshot inclusive.
-   * <p>
-   * If not provided, end snapshot is defaulted to the current table snapshot.
-   *
-   * @param toSnapshotId the end snapshot id (inclusive)
-   * @return an incremental table scan up to {@code toSnapshotId} inclusive
-   */
-  IncrementalAppendScan toSnapshot(long toSnapshotId);
+public interface IncrementalAppendScan extends IncrementalScan<IncrementalAppendScan, FileScanTask, CombinedScanTask> {
 }
diff --git a/api/src/main/java/org/apache/iceberg/IncrementalChangelogScan.java b/api/src/main/java/org/apache/iceberg/IncrementalChangelogScan.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+/**
+ * API for configuring an incremental scan for table changes; its task type is {@link ChangelogScanTask}.
+ */
+public interface IncrementalChangelogScan
+    extends IncrementalScan<IncrementalChangelogScan, ChangelogScanTask, ScanTaskGroup<ChangelogScanTask>> {
+}
diff --git a/api/src/main/java/org/apache/iceberg/IncrementalScan.java b/api/src/main/java/org/apache/iceberg/IncrementalScan.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg;
+
+/**
+ * API for configuring an incremental scan bounded by a start snapshot and an end snapshot.
+ */
+public interface IncrementalScan<ThisT, T extends ScanTask, G extends ScanTaskGroup<T>> extends Scan<ThisT, T, G> {
+  /**
+   * Refines this scan with a start snapshot ID (inclusive).
+   * <p>
+   * If the start snapshot is not configured, it defaults to the oldest ancestor
+   * of the end snapshot (inclusive).
+   *
+   * @param fromSnapshotId the start snapshot ID (inclusive)
+   * @return this for method chaining
+   * @throws IllegalArgumentException if the start snapshot is not an ancestor of the end snapshot
+   */
+  ThisT fromSnapshotInclusive(long fromSnapshotId);
+
+  /**
+   * Refines this scan with a start snapshot ID (exclusive).
+   * <p>
+   * If the start snapshot is not configured, it defaults to the oldest ancestor
+   * of the end snapshot (inclusive).
+   *
+   * @param fromSnapshotId the start snapshot ID (exclusive)
+   * @return this for method chaining
+   * @throws IllegalArgumentException if the start snapshot is not an ancestor of the end snapshot
+   */
+  ThisT fromSnapshotExclusive(long fromSnapshotId);
+
+  /**
+   * Refines this scan with an end snapshot ID (inclusive), so that it looks for changes up to that snapshot.
+   * <p>
+   * If the end snapshot is not configured, it defaults to the current table snapshot (inclusive).
+   *
+   * @param toSnapshotId the end snapshot ID (inclusive)
+   * @return this for method chaining
+   */
+  ThisT toSnapshot(long toSnapshotId);
+}
diff --git a/api/src/main/java/org/apache/iceberg/Table.java b/api/src/main/java/org/apache/iceberg/Table.java
@@ -64,6 +64,17 @@ default IncrementalAppendScan newIncrementalAppendScan() {
     throw new UnsupportedOperationException("Incremental append scan is not supported");
   }
 
+  /**
+   * Create a new {@link IncrementalChangelogScan} for this table. Once a scan is created, it can be
+   * refined to project columns and filter data.
+   *
+   * @return an incremental changelog scan
+   * @throws UnsupportedOperationException by default, when the table does not override this method
+   */
+  default IncrementalChangelogScan newIncrementalChangelogScan() {
+    throw new UnsupportedOperationException("Incremental changelog scan is not supported");
+  }
+
   /**
    * Return the {@link Schema schema} for this table.
    *