
Commit 8c2cb27

Tile distribution

1 parent 1e8244f commit 8c2cb27

File tree: 7 files changed, +198 −18 lines

README.md
Lines changed: 40 additions & 2 deletions

@@ -4,6 +4,23 @@
 
 This repository contains different analyses of the data distribution in the OSM dataset.
 
+## Spark locally
+If you don't have access to a Spark cluster, you can run it locally.
+A laptop with 16 GB of memory and 8 cores should be enough.
+In my case, I'm using a desktop with 16 cores and 32 GB of RAM. Full specs are at the very bottom.
+
+To start Spark in local mode, after downloading and uncompressing it:
+```shell script
+sbin/start-all.sh
+```
+
+To access the UI: [http://localhost:8080/](http://localhost:8080/)
+
+To stop Spark in local mode:
+```shell script
+sbin/stop-all.sh
+```
+
 
 ## Extract blocks
 To be able to parallelize, let's extract all blocks. The full universe will take around 4 minutes:

@@ -36,14 +53,35 @@ It will take around 30 minutes.
     -o file:///home/angelcc/Downloads/osm/planet/distribution/nodeId/100
 ```
 
+## Tile distribution
+Below, an example of how to generate the distribution report for 10000x10000 tiles, locally,
+using 5 cores and 4 GB per core. It will take around 30 minutes.
+
+```shell script
+/home/angelcc/apps/spark-2.4.5-bin-hadoop2.7/bin/spark-submit \
+    --class com.simplexportal.simplexspatial.analysis.Driver \
+    --master "spark://angelcc-B450-AORUS-ELITE:7077" \
+    --deploy-mode cluster \
+    --executor-memory 4G \
+    --total-executor-cores 5 \
+    --num-executors 1 \
+    target/scala-2.11/simplexspatial-data-distribution-analysis-assembly-0.1.jar \
+    tile \
+    --latPartitions 10000 \
+    --lonPartitions 10000 \
+    -i file:///home/angelcc/Downloads/osm/planet/blobs \
+    -o file:///home/angelcc/Downloads/osm/planet/distribution/tile/10000x10000
+```
 
 ## Zeppelin
-To start the notebook, from {root_project}/zeppelin:
+To start the notebook, from a temporary folder:
 ```shell script
-docker run -p 8080:8080 --rm \
+mkdir logs notebook
+docker run -p 8081:8080 --rm \
   -v $PWD/logs:/logs \
   -v $PWD/notebook:/notebook \
   -v /home/angelcc/Downloads/osm/planet/distribution/nodeId/100:/zeppelin/data/nodeId \
+  -v /home/angelcc/Downloads/osm/planet/distribution/tile/10000x10000:/zeppelin/data/tile \
   -e ZEPPELIN_LOG_DIR='/logs' \
   -e ZEPPELIN_NOTEBOOK_DIR='/notebook' \
   --name zeppelin \
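The tile report is written as ORC under the `-o` path, which is the same directory the Zeppelin container mounts at `/zeppelin/data/tile`. Before wiring up the notebook, the output can be sanity-checked from a `spark-shell`. A minimal sketch, assuming the paths used above:

```scala
// spark-shell sketch: peek at the tile distribution report written by the job above.
val dist = spark.read.orc(
  "file:///home/angelcc/Downloads/osm/planet/distribution/tile/10000x10000"
)
dist.printSchema()                  // latPart, lonPart, maxId, minId, ids
dist.orderBy($"ids".desc).show(20)  // the 20 densest tiles
```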

src/main/scala/com/simplexportal/simplexspatial/analysis/AppConfig.scala
Lines changed: 9 additions & 7 deletions

@@ -17,16 +17,15 @@
 
 package com.simplexportal.simplexspatial.analysis
 
-import com.acervera.osm4scala.model.OSMEntity
 import com.simplexportal.simplexspatial.analysis.AppConfig.{Command, NoneCmd}
 
 case class AppConfig(
     cmd: Command = NoneCmd,
     input: String = "",
     output: String = "",
-    modPartitions: Long = 0,
-    latPartitions: Long = 0,
-    lonPartitions: Long = 0
+    modPartitions: Int = 0,
+    latPartitions: Int = 0,
+    lonPartitions: Int = 0
 )
 
 object AppConfig {

@@ -65,28 +64,31 @@ object AppConfig {
     cmd(EXTRACT.id)
       .action((_, cfg) => cfg.copy(cmd = EXTRACT))
       .text("Extract blobs from osm pbf file")
+
     cmd(MOD.id)
       .action((_, cfg) => cfg.copy(cmd = MOD))
       .text("Calculate distribution using a module of the node id as partitioner.")
       .children(
-        opt[Long]("partitions")
+        opt[Int]("partitions")
           .abbr("p")
           .required()
           .action((v, args) => args.copy(modPartitions = v))
       )
+
     cmd(TILE.id)
       .action((_, cfg) => cfg.copy(cmd = TILE))
       .text("Calculate distribution partitioning data by Tile")
       .children(
-        opt[Long]("latPartitions")
+        opt[Int]("latPartitions")
          .abbr("latP")
          .required()
          .action((v, cfg) => cfg.copy(latPartitions = v)),
-        opt[Long]("lonPartitions")
+        opt[Int]("lonPartitions")
          .abbr("lonP")
          .required()
          .action((v, cfg) => cfg.copy(lonPartitions = v))
      )
+
    checkConfig {
      case cfg: AppConfig if cfg.cmd == "" => failure("partitioner not present.")
      case _ => success
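The switch from `opt[Long]` to `opt[Int]` keeps the parsed option types aligned with the new `Int` fields of the case class; scopt derives the value parser from the type parameter. A minimal, self-contained sketch of the same scopt pattern (the `Cfg` wrapper and parser name here are illustrative assumptions, not part of the commit):

```scala
import scopt.OptionParser

case class Cfg(cmd: String = "", latPartitions: Int = 0, lonPartitions: Int = 0)

// Mirrors the cmd(TILE.id).children(...) block from the diff above.
val parser = new OptionParser[Cfg]("analysis") {
  cmd("tile")
    .action((_, c) => c.copy(cmd = "tile"))
    .children(
      opt[Int]("latPartitions").abbr("latP").required()
        .action((v, c) => c.copy(latPartitions = v)),
      opt[Int]("lonPartitions").abbr("lonP").required()
        .action((v, c) => c.copy(lonPartitions = v))
    )
}

// abbr() registers a short alias, so -latP and --latPartitions parse identically.
parser.parse(Seq("tile", "-latP", "10000", "--lonPartitions", "10000"), Cfg())
// => Some(Cfg("tile", 10000, 10000))
```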

src/main/scala/com/simplexportal/simplexspatial/analysis/Driver.scala
Lines changed: 4 additions & 2 deletions

@@ -18,9 +18,8 @@
 package com.simplexportal.simplexspatial.analysis
 
 import com.simplexportal.simplexspatial.analysis.AppConfig._
-import com.simplexportal.simplexspatial.analysis.NodeIdDistribution._
+import org.apache.spark.SparkConf
 import org.apache.spark.sql.SparkSession
-import org.apache.spark.{SparkConf, SparkContext}
 
 object Driver {
 

@@ -39,6 +38,9 @@ object Driver {
       case MOD =>
         NodeIdDistribution
           .run(cfg.input, cfg.output, cfg.modPartitions)
+      case AppConfig.TILE =>
+        TileDistribution
+          .run(cfg.input, cfg.output, cfg.latPartitions, cfg.lonPartitions)
       case EXTRACT =>
         println(s"Extracted ${BlocksExtraction.extractBlobs(cfg.input, cfg.output)} blocks")
       case x =>

src/main/scala/com/simplexportal/simplexspatial/analysis/NodeIdDistribution.scala
Lines changed: 2 additions & 2 deletions

@@ -49,10 +49,10 @@ object NodeIdDistribution {
       entities
         .withColumnRenamed("_1", "partition")
         .withColumnRenamed("_2", "id")
-        .createTempView("ids_per_partition")
+        .createTempView("mod_ids_per_partition")
 
       sparkSession.sql(
-        "select partition, max(id) as maxId, min(id) as minId, count(*) as ids from ids_per_partition group by partition"
+        "select partition, max(id) as maxId, min(id) as minId, count(*) as ids from mod_ids_per_partition group by partition"
       )
     }
 }
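The temp-view rename avoids a collision now that `TileDistribution` registers its own `tile_ids_per_partition` view in the same `SparkSession`: temp views are session-scoped, and `createTempView` fails if the name already exists. For reference, a sketch of the same aggregation written with the DataFrame API, which needs no view at all (assuming a `Dataset[(Long, Long)]` as in the source):

```scala
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{count, lit, max, min}

// Equivalent of the SQL in distribution(), without registering a temp view.
def distribution(entities: Dataset[(Long, Long)]): DataFrame =
  entities
    .toDF("partition", "id")
    .groupBy("partition")
    .agg(max("id").as("maxId"), min("id").as("minId"), count(lit(1)).as("ids"))
```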
src/main/scala/com/simplexportal/simplexspatial/analysis/TileDistribution.scala
Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
+/*
+ * Copyright 2020 Ángel Cervera Claudio
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package com.simplexportal.simplexspatial.analysis
+
+import com.acervera.osm4scala.model.{NodeEntity, OSMEntity}
+import org.apache.spark.SparkContext
+import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
+import org.slf4j.LoggerFactory
+
+object TileDistribution {
+  val logger = LoggerFactory.getLogger(TileDistribution.getClass.getName)
+
+  def run(input: String, output: String, latPartitions: Int, lonPartitions: Int)(
+      implicit ctx: SparkContext,
+      spark: SparkSession
+  ): Unit = {
+    import spark.implicits._
+    Common
+      .fromBlobs(input, extractor(latPartitions, lonPartitions))
+      .toDS
+      .distribution()
+      .write
+      .orc(output)
+  }
+
+  def extractor(
+      latPartitions: Int,
+      lonPartitions: Int,
+      decimalPrecision: Byte = 6
+  ): OSMEntity => Option[(Long, Long)] = {
+    val PRECISION_ROUNDING: Int = Math.pow(10, decimalPrecision).toInt
+
+    def latPartition(lat: Double): Int =
+      ((lat + 90) * PRECISION_ROUNDING).toInt / ((180 * PRECISION_ROUNDING) / latPartitions)
+
+    def lonPartition(lon: Double): Int =
+      ((lon + 180) * PRECISION_ROUNDING).toInt / ((360 * PRECISION_ROUNDING) / lonPartitions)
+
+    (entity: OSMEntity) =>
+      entity match {
+        case node: NodeEntity =>
+          // Lat partition in the high bits, lon partition in the low 16 bits.
+          Some((latPartition(node.latitude) << 16 | lonPartition(node.longitude), node.id))
+        case _ => None
+      }
+  }
+
+  implicit class Entities(entities: Dataset[(Long, Long)])(implicit sparkSession: SparkSession) {
+
+    def distribution(): DataFrame = {
+      entities
+        .withColumnRenamed("_1", "partition")
+        .withColumnRenamed("_2", "id")
+        .createTempView("tile_ids_per_partition")
+
+      val mask = 0x0000ffff // keeps the low 16 bits, matching the 16-bit shift used when packing
+
+      sparkSession.sql(
+        "select" +
+          " int(shiftrightunsigned(partition, 16)) as latPart," +
+          s" int(partition & $mask) as lonPart," +
+          " max(id) as maxId," +
+          " min(id) as minId," +
+          " count(*) as ids" +
+          " from tile_ids_per_partition group by partition"
+      )
+    }
+  }
+}
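The extractor packs both tile indices into a single `Long`: the latitude partition above bit 16 and the longitude partition in the low 16 bits, which is why the SQL unpacks with `shiftrightunsigned(partition, 16)` and the `0xffff` mask. A standalone round-trip sketch of that arithmetic (the coordinates are arbitrary example values):

```scala
val latPartitions = 10000
val lonPartitions = 10000
val precision = 1000000 // 10^6, matching the default decimalPrecision of 6

// Same integer arithmetic as extractor(): scale first, then integer-divide.
def latPartition(lat: Double): Int =
  ((lat + 90) * precision).toInt / ((180 * precision) / latPartitions)
def lonPartition(lon: Double): Int =
  ((lon + 180) * precision).toInt / ((360 * precision) / lonPartitions)

// Pack: latitude partition in the high bits, longitude partition in the low 16 bits.
val packed: Long = latPartition(40.4168) << 16 | lonPartition(-3.7038)

// Unpack, mirroring the SQL.
val latPart = (packed >>> 16).toInt   // 7245
val lonPart = (packed & 0xffff).toInt // 4897; values above 4095 need the full 16-bit mask
```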

src/test/scala/com/simplexportal/simplexspatial/analysis/NodeIdDistributionSpec.scala
Lines changed: 3 additions & 5 deletions

@@ -17,17 +17,15 @@
 
 package com.simplexportal.simplexspatial.analysis
 
-import com.acervera.osm4scala.model.{NodeEntity, OSMEntity, WayEntity}
+import com.acervera.osm4scala.model.{NodeEntity, WayEntity}
 import org.apache.spark.sql.Row
 import org.scalatest.matchers.should.Matchers
 
-import scala.util.Random
-
 class NodeIdDistributionSpec
     extends org.scalatest.wordspec.AnyWordSpecLike
     with Matchers
     with SparkBaseSQLTesting {
-  "Calculate distribution" should {
+  "Calculate mod of id distribution" should {
     "correctly" in {
 
       import NodeIdDistribution._

@@ -47,7 +45,7 @@ class NodeIdDistributionSpec
         .createDataset(data.flatMap(NodeIdDistribution.extractor(10)(_)))
         .distribution
 
-      result.sort($"partition".asc).collect().toSet shouldBe (
+      result.collect().toSet shouldBe (
         Set(
           Row(0, 30, 10, 3),
          Row(1, 21, 11, 2),
src/test/scala/com/simplexportal/simplexspatial/analysis/TileDistributionSpec.scala
Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+/*
+ * Copyright 2020 Ángel Cervera Claudio
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+package com.simplexportal.simplexspatial.analysis
+
+import com.acervera.osm4scala.model.{NodeEntity, WayEntity}
+import org.apache.spark.sql.Row
+import org.scalatest.matchers.should.Matchers
+
+class TileDistributionSpec
+    extends org.scalatest.wordspec.AnyWordSpecLike
+    with Matchers
+    with SparkBaseSQLTesting {
+  "Calculate Tile distribution" should {
+    "correctly" in {
+
+      import TileDistribution._
+      import sparkSession.implicits._
+
+      val data = Seq(
+        NodeEntity(10, -10.1, -10.1, Map.empty),
+        NodeEntity(20, -20.1, -10.1, Map.empty),
+        NodeEntity(30, -30.1, -10.1, Map.empty),
+        NodeEntity(11, -11.1, -10.1, Map.empty),
+        NodeEntity(21, 21.1, 10.1, Map.empty),
+        NodeEntity(32, 32.1, 10.1, Map.empty),
+        WayEntity(100, Seq(10, 20, 30), Map.empty)
+      )
+
+      val result = sparkSession
+        .createDataset(data.flatMap(TileDistribution.extractor(2, 2)(_)))
+        .distribution
+
+      result.collect().toSet shouldBe (
+        Set(
+          Row(0, 0, 30, 10, 4),
+          Row(1, 1, 32, 21, 2)
+        )
+      )
+    }
+  }
+}
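The expected rows can be derived by hand: with `extractor(2, 2)` the grid splits at latitude 0 and longitude 0, so the sign of each coordinate picks the tile. A small sketch of that check (using the default `decimalPrecision` of 6):

```scala
val p = 1000000 // 10^6, the default decimalPrecision
def latP(lat: Double): Int = ((lat + 90) * p).toInt / ((180 * p) / 2)  // divisor 90,000,000
def lonP(lon: Double): Int = ((lon + 180) * p).toInt / ((360 * p) / 2) // divisor 180,000,000

latP(-10.1) // 0: southern half
lonP(-10.1) // 0: western half
latP(21.1)  // 1: northern half
lonP(10.1)  // 1: eastern half
// Nodes 10, 20, 30, 11 all fall in tile (0, 0) -> Row(0, 0, 30, 10, 4);
// nodes 21, 32 fall in tile (1, 1)             -> Row(1, 1, 32, 21, 2);
// the WayEntity carries no coordinates, so the extractor drops it.
```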
