Skip to content

Commit

Permalink
benchmark using reinterpret for points and linestrings
Browse files Browse the repository at this point in the history
simple 2d for now but that can change.  a nice prototype is up on WKG.
  • Loading branch information
asinghvi17 committed Apr 11, 2024
1 parent 98c0ac8 commit 187ca41
Show file tree
Hide file tree
Showing 4 changed files with 191 additions and 2 deletions.
92 changes: 90 additions & 2 deletions src/reader.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,21 @@ struct __GeoPackageFile
source::SQLite.DB
end

"""
This dict encodes a lookup from geometry type strings as defined in
"""
const GEOMETRY_TYPE_LOOKUP = Dict{String, Type}(
"POINT" => GI.PointTrait,
"LINESTRING" => GI.LineStringTrait,
"POLYGON" => GI.PolygonTrait,
"MULTIPOINT" => GI.MultiPointTrait,
"MULTILINESTRING" => GI.MultiLineStringTrait,
"MULTIPOLYGON" => GI.MultiPolygonTrait,
"GEOMETRYCOLLECTION" => GI.GeometryCollectionTrait,
"GEOMETRY" => GI.AbstractGeometryTrait
)

#=
## Notes on working with SQLite.jl
Expand Down Expand Up @@ -50,7 +65,38 @@ function get_crs_table(source)
return crs_df
end

function _get_geometry_table(source, table_name, geometry_column, srs_id, crs_table)
function get_geometry_table(source, table_name, crs_table = get_crs_table(source))
global to
@timeit to "Table retrieval" begin
table_query = """
SELECT gpkg_geometry_columns.column_name, gpkg_geometry_columns.geometry_type_name, gpkg_contents.srs_id
FROM gpkg_geometry_columns
LEFT JOIN gpkg_contents ON gpkg_geometry_columns.table_name = gpkg_contents.table_name;
"""
table_query_result = DBInterface.execute(source, table_query)
@timeit to "Materialization" result = first(table_query_result)
end
geometry_type = GEOMETRY_TYPE_LOOKUP[result[:geometry_type_name]]
return _get_geometry_table(geometry_type, source, table_name, result[:column_name], result[:srs_id], crs_table)
end

function _get_geometry_table(geometry_type::Type{T}, source, table_name, geometry_column, srs_id, crs_table) where T <: GI.AbstractTrait
global to
@timeit to "Table retrieval" begin
table_query = "SELECT * FROM $table_name;"
table_query_result = DBInterface.execute(source, table_query)
@timeit to "Materialization" result = DataFrame(table_query_result)
end

@timeit to "WKB parsing" begin
result[!, geometry_column] = parse_geopkg_wkb.(result[!, geometry_column]; crs_table = crs_table)
end
DataFrames.metadata!(result, "GeoPackage.jl SRS data", crs_table)
DataFrames.metadata!(result, "GeoPackage.jl default SRS", srs_id)
return result
end

function _get_geometry_table(geometry_type::Type{GI.PointTrait}, source, table_name, geometry_column, srs_id, crs_table)
global to
@timeit to "Table retrieval" begin
table_query = "SELECT * FROM $table_name;"
Expand Down Expand Up @@ -159,7 +205,49 @@ function parse_geopkg_wkb(wkb::Vector{UInt8}; crs_table)
return final_geom
end


function _easy_get_point(wkb::Vector{UInt8})
# Check that the magic number is correct
@assert wkb[1] == 0x47 && wkb[2] == 0x50 "The magic bits for a GeoPackage WKB are not present. Expected `[0x47, 0x50]`, got `$(wkb[1:2])`."
# Get version and flag bytes
version = wkb[3]
flag_byte = wkb[4]
# Expand the flag byte into its components
is_extended_gpkg = (flag_byte & 0b00100000) == 0b00100000 # is the GeoPackage extended?
is_empty = (flag_byte & 0b0001000) == 0b0001000 # is the geometry empty? TODO: short-circuit here, return an empty geom. May be a NaN geom instead.
envelope_size_byte = flag_byte << 4 >> 5 # what is the category of the envelope? See note above.
is_little_endian = (flag_byte & 0b00000001) == 0b00000001 # is the WKB little-endian? If not we can't yet handle it in native Julia.

# Calculate envelope size according to the definition in the GeoPackage spec
envelope_size = if envelope_size_byte == 0
0
elseif envelope_size_byte == 1
32
elseif envelope_size_byte == 2
48
elseif envelope_size_byte == 3
48
elseif envelope_size_byte == 4
64
else
error("Invalid envelope size byte: $envelope_size_byte. Specifically, the number evaluated to $(envelope_size_byte), which is in the invalid region between 5 and 7.")
end
# Obtain the SRS ID
srs_id = only(reinterpret(Int32, wkb[5:8]))
# Look this ID up and convert it into a CRS object from `GeoFormatTypes`.
crs_row = findfirst(==(srs_id), crs_table.srs_id)
# We've preprocessed all available CRSs in the geopackage file into
# GeoFormatTypes objects, so we can just index into that table.
crs_obj = crs_table[crs_row, :gft]
# Calculate the number of bytes in the GeoPackage spec header.
# This is dynamic, as it depends on the envelope size.
header_length = 4 #= length of original flags =# +
4 #= length of CRS indicator as Int32 =# +
envelope_size #= size of envelope as given =#
# We index into the WKB to get the actual geometry.
# We need to start from the byte _after_ the header, i.e., `header_length+1`.
final_geom = GI.Point(reinterpret(Float64, wkb[(header_length+1+5):end]); crs = crs_obj)
return final_geom
end

# @b _get_geometry_tables(joinpath(dirname(@__DIR__), "test", "data", "polygon.gpkg")) seconds=3 # 313 μs
# ────────────────────────────────────────────────────────────────────────────────
Expand Down
14 changes: 14 additions & 0 deletions src/types.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""
"""
struct DB
handle::SQLite.DB
crs_dict::Dict{Int, Any}
geometry_tables::NTuple{String}
end

function gettable(db::DB, table_name::String = first(db.geometry_tables))
@assert table_name in db.geometry_tables "Table $table_name is not a geometry table. Available geometry tables are $(db.geometry_tables)."

table_crs = "SE"
end

86 changes: 86 additions & 0 deletions test/benchmark_parsing.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import GeoFormatTypes as GFT, WellKnownGeometry as WKG, GeoInterface as GI

# db = SQLite.DB("/Users/anshul/git/vector-benchmark/data/points.gpkg")

# crs_table = get_crs_table(db)

# table_query = """
# SELECT gpkg_geometry_columns.table_name, gpkg_geometry_columns.column_name, gpkg_geometry_columns.geometry_type_name, gpkg_contents.srs_id
# FROM gpkg_geometry_columns
# LEFT JOIN gpkg_contents ON gpkg_geometry_columns.table_name = gpkg_contents.table_name;
# """
# table_query_result = DBInterface.execute(db, table_query)
# contents_df = DataFrame(table_query_result)

# geoms_table = Tables.columntable(DBInterface.execute(db, "SELECT $(contents_df[1, :column_name]) FROM $(contents_df[1, :table_name])"))

# geoms = geoms_table.geom

tups = tuple.(rand(300_000), rand(300_000))

geoms = GFT.val.(WKG.getwkb.(GI.Point.(tups)))

# Parsing approach 1 - using WellKnownGeometry directly

GFT.WellKnownBinary(GFT.Geom(), view(geoms[1], (8+1):length(geoms[1])))
# This is a bit unfair because a lot of the parsing work would have happened earlier in approach 2,
# but even that is at most 1ms.
@benchmark begin
GO.tuples(GFT.WellKnownBinary.((GFT.Geom(),), getindex.($geoms, ((8+1):length($(geoms[1])),))))
end

# BenchmarkTools.Trial: 19 samples with 1 evaluation.
# Range (min … max): 254.759 ms … 283.244 ms ┊ GC (min … max): 6.17% … 15.63%
# Time (median): 270.962 ms ┊ GC (median): 10.07%
# Time (mean ± σ): 269.855 ms ± 8.431 ms ┊ GC (mean ± σ): 10.96% ± 3.00%

# ▁ █ ▁ ▁ ▁ ▁ ▁ ▁ ▁ █ ▁ ▁ ▁ ▁ ▁ ▁ ▁
# █▁▁▁▁▁▁▁█▁▁▁█▁█▁▁▁▁▁▁▁█▁█▁▁█▁▁▁█▁▁█▁█▁▁▁█▁▁▁▁▁█▁█▁█▁▁▁█▁█▁▁▁█ ▁
# 255 ms Histogram: frequency by time 283 ms <

# Parsing approach 2 - reinterpret the bytes as a tuple

@benchmark begin
map($(geoms)) do geom
only(reinterpret(Tuple{Float64, Float64}, view(geom, (8+1+5):length(geom))))
end
end

ls = GI.LineString(map((geoms)) do geom
only(reinterpret(Tuple{Float64, Float64}, view(geom, (8+1+5):length(geom)))) # this should be 5+1 if we are parsing from WKB
end)

ls_as_wkb = GFT.val(WKG.getwkb(ls))

# Parsing approach 1 - reduce to GO.tuples
@benchmark collect(GI.getpoint(GFT.WellKnownBinary(GFT.Geom(), $ls_as_wkb)))

# Parsing approach 2 - reduce to reinterpret
@benchmark collect(reinterpret(Tuple{Float64, Float64}, view($ls_as_wkb, (1+4+4+1):length($ls_as_wkb))))


# BenchmarkTools.Trial: 10000 samples with 334 evaluations.
# Range (min … max): 258.111 ns … 3.375 μs ┊ GC (min … max): 0.00% … 89.28%
# Time (median): 274.826 ns ┊ GC (median): 0.00%
# Time (mean ± σ): 280.685 ns ± 97.172 ns ┊ GC (mean ± σ): 1.11% ± 2.94%

# ▁▄█▇▅▂ ▁ ▂▁
# ▁▁▂▃▃▃▄▃▅███████▇████▆▆▄▄▄▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▃
# 258 ns Histogram: frequency by time 326 ns <

# Memory estimate: 160 bytes, allocs estimate: 4.


# Parsing approach 3 - ArchGDAL
@benchmark ArchGDAL.fromWKB(ls_as_wkb)

# BenchmarkTools.Trial: 3790 samples with 1 evaluation.
# Range (min … max): 399.750 μs … 1.036 s ┊ GC (min … max): 0.00% … 0.00%
# Time (median): 492.834 μs ┊ GC (median): 0.00%
# Time (mean ± σ): 1.317 ms ± 17.177 ms ┊ GC (mean ± σ): 0.00% ± 0.00%

# █▆▂▁▁▁ ▁ ▁ ▁ ▁
# ██████▇██▇███████████▆▄▆▄▆▄▄▄▅▄▅▄▅▅▄▄▁▄▄▄▃▁▅▄▄▄▁▄▄▁▃▃▁▃▃▁▃▁▄ █
# 400 μs Histogram: log(frequency) by time 7.67 ms <

# Memory estimate: 48 bytes, allocs estimate: 3.
1 change: 1 addition & 0 deletions test/ogc_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ _get_geometry_tables(source)




0 comments on commit 187ca41

Please sign in to comment.