@@ -19,8 +19,12 @@
 package org.apache.iceberg.spark.actions;
 
 import static org.apache.iceberg.MetadataTableType.ALL_MANIFESTS;
+import static org.apache.iceberg.MetadataTableType.ENTRIES;
 import static org.apache.spark.sql.functions.col;
 import static org.apache.spark.sql.functions.lit;
+import static org.apache.spark.sql.functions.max;
+import static org.apache.spark.sql.functions.sum;
+import static org.apache.spark.sql.functions.when;
 
 import java.util.Collection;
 import java.util.Iterator;
@@ -42,6 +46,7 @@
 import org.apache.iceberg.MetadataTableType;
 import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.ReachableFileUtil;
+import org.apache.iceberg.Snapshot;
 import org.apache.iceberg.StaticTableOperations;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableMetadata;
@@ -72,6 +77,8 @@
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
+import org.apache.spark.sql.functions;
+import org.apache.spark.sql.types.DataTypes;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -164,6 +171,68 @@ protected Dataset<FileInfo> contentFileDS(Table table, Set<Long> snapshotIds) {
     return manifestBeanDS.flatMap(new ReadManifest(tableBroadcast), FileInfo.ENCODER);
   }
 
+  protected Dataset<Row> partitionEntryDS(Table table) {
+    Dataset<Row> dataset =
+        loadMetadataTable(table, ENTRIES)
+            // entry status: 0 = EXISTING, 1 = ADDED, 2 = DELETED; keep only live entries
+            .filter(col("status").lt(2))
+            .select(
+                col("data_file.spec_id").as("SPEC_ID"),
+                col("data_file.partition").as("PARTITION_DATA"),
+                // data_file.content: 0 = DATA, 1 = POSITION_DELETES, 2 = EQUALITY_DELETES
+                when(col("data_file.content").equalTo(0), col("data_file.record_count"))
+                    .otherwise(lit(0))
+                    .as("DATA_RECORD_COUNT"),
+                when(col("data_file.content").equalTo(0), lit(1))
+                    .otherwise(lit(0))
+                    .as("DATA_FILE_COUNT"),
+                when(col("data_file.content").equalTo(0), col("data_file.file_size_in_bytes"))
+                    .otherwise(lit(0))
+                    .as("DATA_FILE_SIZE_IN_BYTES"),
+                when(col("data_file.content").equalTo(1), col("data_file.record_count"))
+                    .otherwise(lit(0))
+                    .as("POSITION_DELETE_RECORD_COUNT"),
+                when(col("data_file.content").equalTo(1), lit(1))
+                    .otherwise(lit(0))
+                    .as("POSITION_DELETE_FILE_COUNT"),
+                when(col("data_file.content").equalTo(2), col("data_file.record_count"))
+                    .otherwise(lit(0))
+                    .as("EQUALITY_DELETE_RECORD_COUNT"),
+                when(col("data_file.content").equalTo(2), lit(1))
+                    .otherwise(lit(0))
+                    .as("EQUALITY_DELETE_FILE_COUNT"),
+                functions
+                    .udf((Long snapshotId) -> lastUpdatedTime(snapshotId, table), DataTypes.LongType)
+                    .apply(col("snapshot_id"))
+                    .as("LAST_UPDATED_AT"),
+                col("snapshot_id").as("LAST_UPDATED_SNAPSHOT_ID"),
+                // TODO: effective record count after applying deletes; unclear whether it
+                // can be computed by this distributed algorithm
+                lit(0).as("TOTAL_RECORD_COUNT"));
+
+    // max() keeps the aggregation deterministic; first() has no defined row order here
+    return dataset
+        .groupBy(col("PARTITION_DATA"))
+        .agg(
+            max(col("LAST_UPDATED_SNAPSHOT_ID")).as("LAST_UPDATED_SNAPSHOT_ID"),
+            max(col("LAST_UPDATED_AT")).as("LAST_UPDATED_AT"),
+            max(col("SPEC_ID")).as("SPEC_ID"),
+            sum(col("DATA_FILE_COUNT")).as("DATA_FILE_COUNT"),
+            sum(col("DATA_RECORD_COUNT")).as("DATA_RECORD_COUNT"),
+            sum(col("DATA_FILE_SIZE_IN_BYTES")).as("DATA_FILE_SIZE_IN_BYTES"),
+            sum(col("POSITION_DELETE_FILE_COUNT")).as("POSITION_DELETE_FILE_COUNT"),
+            sum(col("POSITION_DELETE_RECORD_COUNT")).as("POSITION_DELETE_RECORD_COUNT"),
+            sum(col("EQUALITY_DELETE_FILE_COUNT")).as("EQUALITY_DELETE_FILE_COUNT"),
+            sum(col("EQUALITY_DELETE_RECORD_COUNT")).as("EQUALITY_DELETE_RECORD_COUNT"),
+            sum(col("TOTAL_RECORD_COUNT")).as("TOTAL_RECORD_COUNT"));
+  }
+
+  public static long lastUpdatedTime(Long snapshotId, Table table) {
+    // snapshot_id can be null on some entries; guard before unboxing
+    Snapshot snapshot = snapshotId == null ? null : table.snapshot(snapshotId);
+    return snapshot == null ? 0 : snapshot.timestampMillis();
+  }
+
   protected Dataset<FileInfo> manifestDS(Table table) {
     return manifestDS(table, null);
   }
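
A minimal usage sketch of the new partitionEntryDS helper, assuming it is called from a subclass of BaseSparkAction (the method is protected); the table name "db.tbl" and the Spark3Util lookup are illustrative, not part of this change:

  import org.apache.iceberg.Table;
  import org.apache.iceberg.spark.Spark3Util;
  import org.apache.spark.sql.Dataset;
  import org.apache.spark.sql.Row;
  import org.apache.spark.sql.SparkSession;

  // Prints one aggregated row per partition: summed file/record counts per
  // content type, plus the latest snapshot id/timestamp seen for the partition.
  void printPartitionStats(SparkSession spark) throws Exception {
    Table table = Spark3Util.loadIcebergTable(spark, "db.tbl"); // illustrative name
    Dataset<Row> stats = partitionEntryDS(table);
    stats.orderBy(stats.col("LAST_UPDATED_AT").desc()).show(20, false);
  }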