Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Geoparquet filtering and simplification #895

Merged
merged 25 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
f618666
Add s3 transfer manager dependency
bchapuis Sep 30, 2024
f640c29
Improve spliterator concurrency
bchapuis Sep 30, 2024
21816d7
Improve the geoparquet reader
bchapuis Sep 30, 2024
2f53920
Improve the geoparquet reader
bchapuis Oct 5, 2024
c49861a
Make the spliterator an internal class
bchapuis Oct 5, 2024
07944ac
Remove unused writer class and use low level ParquetFileReader API
bchapuis Oct 7, 2024
79147b0
Merge GeoParquetGroup interface and implementation
bchapuis Oct 8, 2024
15b96e1
Remove the nanotime type
bchapuis Oct 8, 2024
443c90c
Distinguish types with single and repeated values
bchapuis Oct 8, 2024
21ea189
Remove the wrappers
bchapuis Oct 8, 2024
61f792c
Improve api
bchapuis Oct 8, 2024
26b41b1
Pass properties instead of parent object
bchapuis Oct 9, 2024
721bdb5
Add filtering capabilities
bchapuis Oct 9, 2024
49aebb8
Improve the creation of the filter predicate
bchapuis Oct 10, 2024
848dc35
Refactor the geoparquet reader
bchapuis Oct 10, 2024
19da919
Add some javadoc
bchapuis Oct 10, 2024
930cc0f
Move the config in a dedicated class
bchapuis Oct 10, 2024
8de3f0f
Move disabled tests in dedicated class
bchapuis Oct 11, 2024
ffbc74c
Fix issues in group
bchapuis Oct 12, 2024
461d30a
Add benchmarking module
bchapuis Oct 12, 2024
2d0eacf
Skip geoparquet files based on their bbox
bchapuis Oct 12, 2024
1325256
Format code
bchapuis Oct 12, 2024
1f9cc17
Fix sonar issue
bchapuis Oct 12, 2024
985ef2c
Suppress warnings in benchmarks
bchapuis Oct 13, 2024
cdb9f6a
Improve documentation and remove println
bchapuis Oct 14, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ examples/openstreetmap/tiles/

examples/transformation/*.pbf

# Benchmarking
baremaps-benchmarking/data/

# Docs
.jekyll-cache/
_site/
Expand Down
64 changes: 64 additions & 0 deletions baremaps-benchmarking/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Build descriptor of the baremaps-benchmarking module: it holds the JMH benchmarks and packages
them, together with their dependencies, into a single runnable jar named "benchmarks".
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.baremaps</groupId>
<artifactId>baremaps</artifactId>
<version>0.7.4-SNAPSHOT</version>
</parent>

<artifactId>baremaps-benchmarking</artifactId>

<properties>
<!-- Single JMH version shared by jmh-core and the JMH annotation processor below. -->
<jmh.version>1.37</jmh.version>
<!-- This module is a development tool: it is excluded from the deploy step. -->
<maven.deploy.skip>true</maven.deploy.skip>
</properties>

<dependencies>
<!-- No versions are declared for the baremaps artifacts; presumably they are managed by the parent pom (verify). -->
<dependency>
<groupId>org.apache.baremaps</groupId>
<artifactId>baremaps-geoparquet</artifactId>
</dependency>
<dependency>
<groupId>org.apache.baremaps</groupId>
<artifactId>baremaps-testing</artifactId>
</dependency>
<!-- JMH runtime (annotations and runner). -->
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-core</artifactId>
<version>${jmh.version}</version>
</dependency>
<!-- JMH annotation processor that generates the benchmark harness code at compile time. -->
<!-- NOTE(review): the JMH Maven archetype declares this dependency with the "provided" scope; consider aligning. -->
<dependency>
<groupId>org.openjdk.jmh</groupId>
<artifactId>jmh-generator-annprocess</artifactId>
<version>${jmh.version}</version>
</dependency>
</dependencies>

<build>
<plugins>
<!-- Builds the shaded "benchmarks" jar during the package phase, with the JMH launcher as its main class. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.6.0</version>
<executions>
<execution>
<goals>
<goal>shade</goal>
</goals>
<phase>package</phase>
<configuration>
<finalName>benchmarks</finalName>
<transformers>
<!-- Writes the Main-Class manifest entry so the jar can be started with "java -jar". -->
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>org.openjdk.jmh.Main</mainClass>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.baremaps.benchmarking.geoparquet;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;
import org.apache.baremaps.geoparquet.GeoParquetReader;
import org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.s3.S3Client;
import software.amazon.awssdk.services.s3.model.GetObjectRequest;
import software.amazon.awssdk.services.s3.model.ListObjectsV2Request;

/**
 * JMH benchmark that reads the Overture Maps "addresses" theme with the {@link GeoParquetReader},
 * sequentially and in parallel.
 *
 * <p>
 * The data set is downloaded anonymously from the public Overture Maps S3 bucket the first time the
 * benchmark runs and is cached in the {@code baremaps-benchmarking/data/overturemaps} directory.
 * Delete that directory to force a fresh download.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 0)
@Measurement(iterations = 1)
public class OvertureMapsBenchmark {

  /** Public bucket that hosts the Overture Maps releases. */
  private static final String BUCKET = "overturemaps-us-west-2";

  /** Key prefix of the release and theme downloaded by the benchmark. */
  private static final String PREFIX = "release/2024-09-18.0/theme=addresses/";

  /** Local cache directory, resolved against the working directory of the benchmark. */
  private static final Path DIRECTORY = Path.of("baremaps-benchmarking/data/overturemaps");

  /**
   * Runs the benchmarks of this class in a single forked JVM.
   *
   * @param args the command line arguments (ignored)
   * @throws RunnerException if the JMH runner fails
   */
  public static void main(String[] args) throws RunnerException {
    Options opt = new OptionsBuilder()
        .include(OvertureMapsBenchmark.class.getSimpleName())
        .forks(1)
        .build();
    new Runner(opt).run();
  }

  /**
   * Downloads the data set into the cache directory unless that directory already exists.
   *
   * @throws IOException if a directory or a file cannot be created
   */
  @Setup
  public void setup() throws IOException {
    // An existing directory is treated as a complete cache, so no network access is needed.
    if (Files.exists(DIRECTORY)) {
      return;
    }

    // NOTE(review): the client targets us-east-1 while the bucket name suggests us-west-2;
    // confirm that this is intended.
    try (var client = S3Client.builder()
        .region(Region.US_EAST_1)
        .credentialsProvider(new AnonymousAWSCredentialsProvider())
        .build()) {

      var listRequest = ListObjectsV2Request.builder()
          .bucket(BUCKET)
          .prefix(PREFIX)
          .build();

      // A single listObjectsV2 response is limited to one page of keys; the paginator
      // transparently fetches the following pages so that no object is left out.
      for (var object : client.listObjectsV2Paginator(listRequest).contents()) {
        var key = object.key();
        var name = key.substring(key.lastIndexOf('/') + 1);

        // Keys ending with a slash are folder placeholders: their name is empty and would
        // resolve to the cache directory itself.
        if (name.isEmpty()) {
          continue;
        }

        var file = DIRECTORY.resolve(name);

        // The directory is created lazily, once the listing has returned an object, so that a
        // failed listing does not leave an empty directory that looks like a valid cache.
        Files.createDirectories(file.getParent());
        if (!Files.exists(file)) {
          var getRequest = GetObjectRequest.builder()
              .bucket(BUCKET)
              .key(key)
              .build();
          client.getObject(getRequest, file);
        }
      }
    }
  }

  /** Reads the cached data set with a sequential stream and counts its elements. */
  @SuppressWarnings({"squid:S1481", "squid:S2201"})
  @Benchmark
  public void read() {
    GeoParquetReader reader = new GeoParquetReader(DIRECTORY.toUri());
    reader.read().count();
  }

  /** Reads the cached data set with a parallel stream and counts its elements. */
  @SuppressWarnings({"squid:S1481", "squid:S2201"})
  @Benchmark
  public void readParallel() {
    GeoParquetReader reader = new GeoParquetReader(DIRECTORY.toUri());
    reader.readParallel().count();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.baremaps.benchmarking.geoparquet;


import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.TimeUnit;
import org.apache.baremaps.geoparquet.GeoParquetReader;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

/**
 * JMH benchmark that reads a directory of many small GeoParquet files with the
 * {@link GeoParquetReader}, sequentially and in parallel.
 *
 * <p>
 * The fixture consists of copies of the sample file
 * {@code baremaps-testing/data/samples/example.parquet} stored in
 * {@code baremaps-benchmarking/data/small}. Both paths are resolved against the working directory
 * of the benchmark.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@Warmup(iterations = 0)
@Measurement(iterations = 1)
public class SmallFileBenchmark {

  /** Number of copies of the sample file that make up the fixture. */
  private static final int FILE_COUNT = 1000;

  private final Path source =
      Path.of("baremaps-testing/data/samples/example.parquet").toAbsolutePath();
  private final Path directory = Path.of("baremaps-benchmarking/data/small").toAbsolutePath();

  /**
   * Runs the benchmarks of this class in a single forked JVM.
   *
   * @param args the command line arguments (ignored)
   * @throws RunnerException if the JMH runner fails
   */
  public static void main(String[] args) throws RunnerException {
    Options opt = new OptionsBuilder()
        .include(SmallFileBenchmark.class.getSimpleName())
        .forks(1)
        .build();
    new Runner(opt).run();
  }

  /**
   * Creates the fixture directory and fills it with the copies of the sample file.
   *
   * <p>
   * Copies that already exist are kept, so a setup that was interrupted is completed on the next
   * run instead of leaving a partial fixture behind.
   *
   * @throws IOException if the directory or one of the copies cannot be created
   */
  @Setup
  public void setup() throws IOException {
    Files.createDirectories(directory);
    for (int i = 0; i < FILE_COUNT; i++) {
      Path target = directory.resolve(i + ".parquet");
      if (!Files.exists(target)) {
        Files.copy(source, target);
      }
    }
  }

  /** Reads all the copies with a sequential stream and counts their elements. */
  @SuppressWarnings({"squid:S1481", "squid:S2201"})
  @Benchmark
  public void read() {
    GeoParquetReader reader = new GeoParquetReader(glob());
    reader.read().count();
  }

  /** Reads all the copies with a parallel stream and counts their elements. */
  @SuppressWarnings({"squid:S1481", "squid:S2201"})
  @Benchmark
  public void readParallel() {
    GeoParquetReader reader = new GeoParquetReader(glob());
    reader.readParallel().count();
  }

  /** Returns the URI of the glob pattern matching every parquet file of the fixture directory. */
  private java.net.URI glob() {
    return directory.resolve("*.parquet").toUri();
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,12 @@ public class GeoParquetDataTable implements DataTable {

public GeoParquetDataTable(URI path) {
this.path = path;
}

private GeoParquetReader reader() {
if (reader == null) {
reader = new GeoParquetReader(path);
}
return reader;
this.reader = new GeoParquetReader(path);
}

@Override
public long size() {
return reader().size();
return reader.size();
}

@Override
Expand All @@ -66,8 +60,8 @@ public Stream<DataRow> stream() {

@Override
public Stream<DataRow> parallelStream() {
return reader().readParallel().map(group -> new DataRowImpl(
GeoParquetTypeConversion.asSchema(path.toString(), group.getSchema()),
return reader.readParallel().map(group -> new DataRowImpl(
GeoParquetTypeConversion.asSchema(path.toString(), group.getGeoParquetSchema()),
GeoParquetTypeConversion.asRowValues(group)));
}

Expand All @@ -76,7 +70,6 @@ public void clear() {
if (reader != null) {
reader = null;
}

if (schema != null) {
schema = null;
}
Expand All @@ -87,15 +80,15 @@ public DataSchema schema() {
if (schema == null) {
this.schema = GeoParquetTypeConversion.asSchema(
path.toString(),
reader().getGeoParquetSchema());
reader.getGeoParquetSchema());
return this.schema;
}
return schema;
}

public int srid(String column) {
try {
return reader().getGeoParquetMetadata().getSrid(column);
return reader.getGeoParquetMetadata().getSrid(column);
} catch (Exception e) {
throw new GeoParquetException("Fail to read the SRID from the GeoParquet metadata", e);
}
Expand Down
Loading
Loading