Part 2: Integrate Scan Planning to Core #13400

Draft · wants to merge 4 commits into main
14 changes: 8 additions & 6 deletions core/src/main/java/org/apache/iceberg/ContentFileParser.java
@@ -134,29 +134,31 @@ static void toJson(ContentFile<?> contentFile, PartitionSpec spec, JsonGenerator
     generator.writeEndObject();
   }
 
-  static ContentFile<?> fromJson(JsonNode jsonNode, PartitionSpec spec) {
+  static ContentFile<?> fromJson(JsonNode jsonNode, Map<Integer, PartitionSpec> specsById) {
     Preconditions.checkArgument(jsonNode != null, "Invalid JSON node for content file: null");
     Preconditions.checkArgument(
         jsonNode.isObject(), "Invalid JSON node for content file: non-object (%s)", jsonNode);
-    Preconditions.checkArgument(spec != null, "Invalid partition spec: null");
-
+    Preconditions.checkArgument(specsById != null, "Invalid partition spec: null");
     int specId = JsonUtil.getInt(SPEC_ID, jsonNode);
     FileContent fileContent = FileContent.valueOf(JsonUtil.getString(CONTENT, jsonNode));
     String filePath = JsonUtil.getString(FILE_PATH, jsonNode);
     FileFormat fileFormat = FileFormat.fromString(JsonUtil.getString(FILE_FORMAT, jsonNode));
 
     PartitionData partitionData = null;
     if (jsonNode.has(PARTITION)) {
-      partitionData = new PartitionData(spec.partitionType());
+      partitionData = new PartitionData(specsById.get(specId).partitionType());
       StructLike structLike =
-          (StructLike) SingleValueParser.fromJson(spec.partitionType(), jsonNode.get(PARTITION));
+          (StructLike)
+              SingleValueParser.fromJson(
+                  specsById.get(specId).partitionType(), jsonNode.get(PARTITION));
       Preconditions.checkState(
           partitionData.size() == structLike.size(),
           "Invalid partition data size: expected = %s, actual = %s",
           partitionData.size(),
           structLike.size());
       for (int pos = 0; pos < partitionData.size(); ++pos) {
-        Class<?> javaClass = spec.partitionType().fields().get(pos).type().typeId().javaClass();
+        Class<?> javaClass =
+            specsById.get(specId).partitionType().fields().get(pos).type().typeId().javaClass();
         partitionData.set(pos, structLike.get(pos, javaClass));
       }
     }
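With the new signature, spec resolution moves into the parser: the caller supplies every known spec keyed by id, and the parser picks the one matching the file's embedded spec-id field. A minimal caller-side sketch, assuming a resolved Table and a data-file JsonNode (the helper below is hypothetical, not part of this PR, and would have to live in the org.apache.iceberg package since the parser is package-private):

```java
// Hypothetical helper: parse a content file using the table's full spec map,
// so files written under older partition specs still resolve correctly.
static ContentFile<?> readContentFile(Table table, JsonNode dataFileNode) {
  Map<Integer, PartitionSpec> specsById = table.specs(); // all specs, keyed by spec id
  return ContentFileParser.fromJson(dataFileNode, specsById);
}
```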
4 changes: 3 additions & 1 deletion core/src/main/java/org/apache/iceberg/DataTaskParser.java
@@ -21,6 +21,7 @@
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.databind.JsonNode;
 import java.io.IOException;
+import java.util.Map;
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.util.JsonUtil;
 
@@ -64,7 +65,8 @@ static StaticDataTask fromJson(JsonNode jsonNode) {
     DataFile metadataFile =
         (DataFile)
             ContentFileParser.fromJson(
-                JsonUtil.get(METADATA_FILE, jsonNode), PartitionSpec.unpartitioned());
+                JsonUtil.get(METADATA_FILE, jsonNode),
+                Map.of(PartitionSpec.unpartitioned().specId(), PartitionSpec.unpartitioned()));
 
     JsonNode rowsArray = JsonUtil.get(ROWS, jsonNode);
     Preconditions.checkArgument(
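Metadata tables are always unpartitioned, so this call site wraps the unpartitioned spec in a one-entry map. The same adapter works anywhere only a single spec is in scope; a sketch (the helper name is illustrative, not part of this PR):

```java
// Hypothetical adapter: expose one PartitionSpec through the
// Map<Integer, PartitionSpec> shape that ContentFileParser.fromJson now takes.
private static Map<Integer, PartitionSpec> singleSpec(PartitionSpec spec) {
  return Map.of(spec.specId(), spec);
}
```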
8 changes: 6 additions & 2 deletions core/src/main/java/org/apache/iceberg/FileScanTaskParser.java
@@ -21,6 +21,7 @@
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.databind.JsonNode;
 import java.io.IOException;
+import java.util.Map;
 import org.apache.iceberg.expressions.Expression;
 import org.apache.iceberg.expressions.ExpressionParser;
 import org.apache.iceberg.expressions.Expressions;
@@ -86,7 +87,9 @@ static FileScanTask fromJson(JsonNode jsonNode, boolean caseSensitive) {

     DataFile dataFile = null;
     if (jsonNode.has(DATA_FILE)) {
-      dataFile = (DataFile) ContentFileParser.fromJson(jsonNode.get(DATA_FILE), spec);
+      dataFile =
+          (DataFile)
+              ContentFileParser.fromJson(jsonNode.get(DATA_FILE), Map.of(spec.specId(), spec));
     }
 
     long start = JsonUtil.getLong(START, jsonNode);
@@ -102,7 +105,8 @@ static FileScanTask fromJson(JsonNode jsonNode, boolean caseSensitive) {
      // parse the schema array
      ImmutableList.Builder<DeleteFile> builder = ImmutableList.builder();
      for (JsonNode deleteFileNode : deletesArray) {
-        DeleteFile deleteFile = (DeleteFile) ContentFileParser.fromJson(deleteFileNode, spec);
+        DeleteFile deleteFile =
+            (DeleteFile) ContentFileParser.fromJson(deleteFileNode, Map.of(spec.specId(), spec));
        builder.add(deleteFile);
      }
 
100 changes: 100 additions & 0 deletions core/src/main/java/org/apache/iceberg/RESTFileScanTaskParser.java
@@ -0,0 +1,100 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.JsonNode;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.iceberg.expressions.Expression;
import org.apache.iceberg.expressions.ExpressionParser;
import org.apache.iceberg.expressions.ResidualEvaluator;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
import org.apache.iceberg.util.JsonUtil;

public class RESTFileScanTaskParser {
  private static final String DATA_FILE = "data-file";
  private static final String DELETE_FILE_REFERENCES = "delete-file-references";
  private static final String RESIDUAL_FILTER = "residual-filter";

  private RESTFileScanTaskParser() {}

  public static void toJson(
      FileScanTask fileScanTask,
      Set<Integer> deleteFileReferences,
      PartitionSpec partitionSpec,
      JsonGenerator generator)
      throws IOException {
    Preconditions.checkArgument(fileScanTask != null, "Invalid file scan task: null");
    Preconditions.checkArgument(generator != null, "Invalid JSON generator: null");

    generator.writeStartObject();
    generator.writeFieldName(DATA_FILE);
    ContentFileParser.toJson(fileScanTask.file(), partitionSpec, generator);
    if (deleteFileReferences != null) {
      JsonUtil.writeIntegerArray(DELETE_FILE_REFERENCES, deleteFileReferences, generator);
    }

    if (fileScanTask.residual() != null) {
      generator.writeFieldName(RESIDUAL_FILTER);
      ExpressionParser.toJson(fileScanTask.residual(), generator);
    }
    generator.writeEndObject();
  }

  public static FileScanTask fromJson(
      JsonNode jsonNode,
      List<DeleteFile> allDeleteFiles,
      Map<Integer, PartitionSpec> specsById,
      boolean isCaseSensitive) {
    Preconditions.checkArgument(jsonNode != null, "Invalid JSON node for file scan task: null");
    Preconditions.checkArgument(
        jsonNode.isObject(), "Invalid JSON node for file scan task: non-object (%s)", jsonNode);

    DataFile dataFile =
        (DataFile) ContentFileParser.fromJson(JsonUtil.get(DATA_FILE, jsonNode), specsById);
    int specId = dataFile.specId();

    DeleteFile[] deleteFiles = null;
    Set<Integer> deleteFileReferences = Sets.newHashSet();
    if (jsonNode.has(DELETE_FILE_REFERENCES)) {
      deleteFileReferences.addAll(JsonUtil.getIntegerList(DELETE_FILE_REFERENCES, jsonNode));
      ImmutableList.Builder<GenericDeleteFile> builder = ImmutableList.builder();
      deleteFileReferences.forEach(
          delIdx -> builder.add((GenericDeleteFile) allDeleteFiles.get(delIdx)));
      deleteFiles = builder.build().toArray(new GenericDeleteFile[0]);
    }

    Expression filter = null;
    if (jsonNode.has(RESIDUAL_FILTER)) {
      filter = ExpressionParser.fromJson(jsonNode.get(RESIDUAL_FILTER));
    }

    String schemaString = SchemaParser.toJson(specsById.get(specId).schema());
    String specString = PartitionSpecParser.toJson(specsById.get(specId));
    ResidualEvaluator boundResidual =
        ResidualEvaluator.of(specsById.get(specId), filter, isCaseSensitive);

    return new BaseFileScanTask(dataFile, deleteFiles, schemaString, specString, boundResidual);
  }
}
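A wiring sketch for a call site (hypothetical, not in this diff): the planning response is assumed to carry one shared delete-file list that each task references by position, and the spec map can come from Table#specs():

```java
// Hypothetical call site: parse one task from a REST planning response,
// resolving delete-file references against the response-level list.
static FileScanTask parseTask(
    JsonNode taskNode, List<DeleteFile> responseDeleteFiles, Table table) {
  return RESTFileScanTaskParser.fromJson(
      taskNode, responseDeleteFiles, table.specs(), /* isCaseSensitive= */ true);
}
```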
76 changes: 76 additions & 0 deletions core/src/main/java/org/apache/iceberg/RestTable.java
@@ -0,0 +1,76 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.iceberg;

import java.util.Map;
import java.util.function.Supplier;
import org.apache.iceberg.catalog.TableIdentifier;
import org.apache.iceberg.metrics.MetricsReporter;
import org.apache.iceberg.rest.RESTClient;
import org.apache.iceberg.rest.ResourcePaths;
import org.apache.iceberg.rest.auth.AuthSession;

public class RestTable extends BaseTable {
  private final RESTClient client;
  private final String path;
  private final Supplier<Map<String, String>> headers;
  private final MetricsReporter reporter;
  private final ResourcePaths resourcePaths;
  private final TableIdentifier tableIdentifier;
  private final AuthSession authSession;

  public RestTable(
      TableOperations ops,
      String name,
      MetricsReporter reporter,
      RESTClient client,
      AuthSession authSession,
      String path,
      Supplier<Map<String, String>> headers,
      TableIdentifier tableIdentifier,
      ResourcePaths resourcePaths) {
    super(ops, name, reporter);
    this.reporter = reporter;
    this.client = client;
    this.headers = headers;
    this.path = path;
    this.tableIdentifier = tableIdentifier;
    this.resourcePaths = resourcePaths;
    this.authSession = authSession;
  }

  @Override
  public TableScan newScan() {
    // TODO: when building ImmutableTableScanContext, how do we ensure the correct
    // snapshotId is used for point-in-time cases? Spark appears to follow a similar
    // approach; see SparkDistributedDataScan.

    return new RestTableScan(
        this,
        schema(),
        ImmutableTableScanContext.builder().metricsReporter(reporter).build(),
        client,
        authSession,
        path,
        headers,
        operations(),
        tableIdentifier,
        resourcePaths);
  }
}
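Since RestTable is still a BaseTable, callers keep using the standard Table API and server-side planning remains an implementation detail of newScan(). A usage sketch (catalog wiring and table name assumed, not shown in this diff):

```java
// Hypothetical usage: load a REST-backed table and plan a scan as with any Table.
Table table = restCatalog.loadTable(TableIdentifier.of("db", "events"));
try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
  tasks.forEach(task -> System.out.println(task.file().location()));
}
```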