code clean-up based on PR review

onthegomap · msbarry · Jan 14, 2023 · Jan 10, 2023 · Jan 10, 2023 · Jan 10, 2023
commit 17935d2fabb168f6ebbf58e7da167d0f9860563e
diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/VectorTile.java
@@ -523,17 +523,22 @@ private boolean containsOnlyFillsOrEdges(boolean allowEdges) {
     return !empty;
   }
 
-  public boolean likelyToBeDuplicate() {
-    if (layers.size() <= 0) {
-      return true;
-    } else if (layers.size() == 1 && layers.values().stream().findFirst().get().encodedFeatures.isEmpty()) {
-      return true;
-    }
-    var result = containsOnlyFillsOrEdges();
-    if (result) {
-      return true;
-    }
-    return result;
+  /**
+   * Determines whether a tile is likely to be a duplicate of some other tile, in which case it is worth calculating a
+   * hash for it.
+   * <p>
+   * The deduplication code aims for a balance between filtering out all duplicates and not spending too much CPU on
+   * hash calculations: calculating hashes for all tiles costs too much CPU, while not calculating hashes at all means
+   * generating mbtiles that are too big. This method is responsible for achieving that balance.
+   * <p>
+   * The current understanding is that, for the whole planet, there are 267m total tiles and 38m unique tiles. The
+   * {@link #containsOnlyFills()} heuristic catches >99.9% of repeated tiles and cuts down the number of tile hashes we
+   * need to track by 98% (38m to 735k), so it is considered a good tradeoff.
+   *
+   * @return {@code true} if the tile might have duplicates, and so a hash should be calculated for it
+   */
+  public boolean likelyToBeDuplicated() {
+    return layers.values().stream().allMatch(v -> v.encodedFeatures.isEmpty()) || containsOnlyFills();
   }
 
   private enum Command {

diff --git a/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java b/planetiler-core/src/main/java/com/onthegomap/planetiler/mbtiles/MbtilesWriter.java
@@ -290,7 +290,7 @@ private void tileEncoder(Iterable<TileBatch> prev, Consumer<TileBatch> next) thr
           lastEncoded = encoded;
           lastBytes = bytes;
           last = tileFeatures;
-          if (compactDb && en.likelyToBeDuplicate() && bytes != null) {
+          if (compactDb && en.likelyToBeDuplicated() && bytes != null) {
             tileDataHash = generateContentHash(bytes);
           } else {
             tileDataHash = null;
@@ -419,9 +419,7 @@ private long tilesEmitted() {
    * Used as an optimization to avoid writing the same (mostly ocean) tiles over and over again.
    */
   public static long generateContentHash(byte[] bytes) {
-    long hash = Hashing.FNV1_64_INIT;
-    hash = Hashing.fnv1a64(hash, bytes);
-    return hash;
+    return Hashing.fnv1a64(bytes);
   }
 
   /**

diff --git a/planetiler-core/src/test/java/com/onthegomap/planetiler/collection/FeatureGroupTest.java b/planetiler-core/src/test/java/com/onthegomap/planetiler/collection/FeatureGroupTest.java
@@ -365,16 +365,16 @@ void testGenerateContentHash(String testName, boolean expectSame, PuTileArgs arg
     put(args1);
     sorter.sort();
     var iter = features.iterator();
-    var tile0_hash = MbtilesWriter.generateContentHash(
+    var tileHash0 = MbtilesWriter.generateContentHash(
       Gzip.gzip(iter.next().getVectorTileEncoder().encode())
     );
-    var tile1_hash = MbtilesWriter.generateContentHash(
+    var tileHash1 = MbtilesWriter.generateContentHash(
       Gzip.gzip(iter.next().getVectorTileEncoder().encode())
     );
     if (expectSame) {
-      assertEquals(tile0_hash, tile1_hash);
+      assertEquals(tileHash0, tileHash1);
     } else {
-      assertNotEquals(tile0_hash, tile1_hash);
+      assertNotEquals(tileHash0, tileHash1);
     }
   }