diff --git a/src/main/scala/magellan/ShapefileRelation.scala b/src/main/scala/magellan/ShapefileRelation.scala index c99115d..7697dc9 100644 --- a/src/main/scala/magellan/ShapefileRelation.scala +++ b/src/main/scala/magellan/ShapefileRelation.scala @@ -20,11 +20,12 @@ import java.util.Objects import magellan.io._ import magellan.mapreduce._ -import org.apache.hadoop.io.{MapWritable, Text} +import org.apache.hadoop.io.{ArrayWritable, LongWritable, MapWritable, Text} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import scala.collection.JavaConversions._ +import scala.util.Try /** * A Shapefile relation is the entry point for working with Shapefile formats. @@ -37,6 +38,27 @@ case class ShapeFileRelation( protected override def _buildScan(): RDD[Array[Any]] = { + // read the shx files, if they exist + val fileNameToFileSplits = Try(sc.newAPIHadoopFile( + path + "/*.shx", + classOf[ShxInputFormat], + classOf[Text], + classOf[ArrayWritable] + ).map { case (txt: Text, splits: ArrayWritable) => + val fileName = txt.toString + val s = splits.get() + val size = s.length + var i = 0 + val v = Array.fill(size)(0L) + while (i < size) { + v.update(i, s(i).asInstanceOf[LongWritable].get()) + i += 1 + } + (fileName, v) + }.collectAsMap()) + + fileNameToFileSplits.map(SplitInfos.SPLIT_INFO_MAP.set(_)) + val shapefileRdd = sqlContext.sparkContext.newAPIHadoopFile( path + "/*.shp", classOf[ShapeInputFormat], diff --git a/src/main/scala/magellan/io/ShapeWritable.scala b/src/main/scala/magellan/io/ShapeWritable.scala index 73e7dcf..3709b1d 100644 --- a/src/main/scala/magellan/io/ShapeWritable.scala +++ b/src/main/scala/magellan/io/ShapeWritable.scala @@ -22,7 +22,7 @@ import magellan.Shape import org.apache.commons.io.EndianUtils import org.apache.hadoop.io.Writable -private[magellan] class ShapeWritable(shapeType: Int) extends Writable { +private[magellan] class ShapeWritable extends Writable { var shape: Shape = _ @@ -32,8 +32,6 @@ private[magellan] class ShapeWritable(shapeType: Int) extends Writable { override def readFields(dataInput: DataInput): Unit = { val shapeType = EndianUtils.swapInteger(dataInput.readInt()) - // all records share the same type or nullshape. - require(this.shapeType == shapeType || shapeType == 0) val h = shapeType match { case 0 => new NullShapeReader() case 1 => new PointReader() diff --git a/src/main/scala/magellan/mapreduce/ShapeInputFormat.scala b/src/main/scala/magellan/mapreduce/ShapeInputFormat.scala index f5c9cb7..95364dd 100644 --- a/src/main/scala/magellan/mapreduce/ShapeInputFormat.scala +++ b/src/main/scala/magellan/mapreduce/ShapeInputFormat.scala @@ -16,20 +16,76 @@ package magellan.mapreduce -import org.apache.hadoop.fs.Path +import com.google.common.base.Stopwatch +import magellan.io.{ShapeKey, ShapeWritable} +import org.apache.commons.logging.LogFactory +import org.apache.hadoop.fs.{LocatedFileStatus, Path} import org.apache.hadoop.mapreduce.lib.input._ import org.apache.hadoop.mapreduce.{InputSplit, JobContext, TaskAttemptContext} -import magellan.io.{ShapeWritable, ShapeKey} +import scala.collection.JavaConversions._ +import scala.collection.mutable.ListBuffer -private[magellan] class ShapeInputFormat extends FileInputFormat[ShapeKey, ShapeWritable] { +private[magellan] class ShapeInputFormat + extends FileInputFormat[ShapeKey, ShapeWritable] { + + private val log = LogFactory.getLog(classOf[ShapeInputFormat]) override def createRecordReader(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) = { new ShapefileReader } - // TODO: Use DBIndex to figure out how to efficiently split files. - override def isSplitable(context: JobContext, filename: Path): Boolean = false + override def isSplitable(context: JobContext, filename: Path): Boolean = true + + override def getSplits(job: JobContext): java.util.List[InputSplit] = { + val splitInfos = SplitInfos.SPLIT_INFO_MAP.get() + computeSplits(job, splitInfos) + } + + private def computeSplits( + job: JobContext, + splitInfos: scala.collection.Map[String, Array[Long]]) = { + val sw = new Stopwatch().start + val splits = ListBuffer[InputSplit]() + val files = listStatus(job) + for (file <- files) { + val path = file.getPath + val length = file.getLen + val blkLocations = if (file.isInstanceOf[LocatedFileStatus]) { + file.asInstanceOf[LocatedFileStatus].getBlockLocations + } else { + val fs = path.getFileSystem(job.getConfiguration) + fs.getFileBlockLocations(file, 0, length) + } + val key = path.getName.split("\\.shp$")(0) + if (splitInfos == null || !splitInfos.containsKey(key)) { + val blkIndex = getBlockIndex(blkLocations, 0) + splits.+= (makeSplit(path, 0, length, blkLocations(blkIndex).getHosts, + blkLocations(blkIndex).getCachedHosts)) + } else { + val s = splitInfos(key).toSeq + val start = s + val end = s.drop(1) ++ Seq(length) + start.zip(end).foreach { case (startOffset: Long, endOffset: Long) => + val blkIndex = getBlockIndex(blkLocations, startOffset) + splits.+=(makeSplit(path, startOffset, endOffset - startOffset, blkLocations(blkIndex).getHosts, + blkLocations(blkIndex).getCachedHosts)) + } + } + } + sw.stop + if (log.isDebugEnabled) { + log.debug("Total # of splits generated by getSplits: " + splits.size + ", TimeTaken: " + sw.elapsedMillis) + } + splits + } } + +object SplitInfos { + + // TODO: Can we get rid of this hack to pass split calculation to the Shapefile Reader? + val SPLIT_INFO_MAP = new ThreadLocal[scala.collection.Map[String, Array[Long]]] + +} \ No newline at end of file diff --git a/src/main/scala/magellan/mapreduce/ShapefileReader.scala b/src/main/scala/magellan/mapreduce/ShapefileReader.scala index f1b811b..419315f 100644 --- a/src/main/scala/magellan/mapreduce/ShapefileReader.scala +++ b/src/main/scala/magellan/mapreduce/ShapefileReader.scala @@ -32,11 +32,9 @@ private[magellan] class ShapefileReader extends RecordReader[ShapeKey, ShapeWrit private var dis: DataInputStream = _ - private var length: BigInt = _ - private var remaining: BigInt = _ - override def getProgress: Float = remaining.toFloat / length.toFloat + override def getProgress: Float = 0 override def nextKeyValue(): Boolean = { if (remaining <= 0) { @@ -47,7 +45,7 @@ private[magellan] class ShapefileReader extends RecordReader[ShapeKey, ShapeWrit val recordNumber = dis.readInt() // record numbers begin at 1 require(recordNumber > 0) - val contentLength = 16 * (dis.readInt() + 4) + val contentLength = 2 * (dis.readInt() + 4) value.readFields(dis) remaining -= contentLength key.setRecordIndex(key.getRecordIndex() + 1) @@ -60,27 +58,26 @@ private[magellan] class ShapefileReader extends RecordReader[ShapeKey, ShapeWrit override def initialize(inputSplit: InputSplit, taskAttemptContext: TaskAttemptContext) { val split = inputSplit.asInstanceOf[FileSplit] val job = MapReduceUtils.getConfigurationFromContext(taskAttemptContext) - val start = split.getStart() - val end = start + split.getLength() - val file = split.getPath() - val fs = file.getFileSystem(job) - val is = fs.open(split.getPath()) + + val path = split.getPath() + val fs = path.getFileSystem(job) + val is = fs.open(path) + + val (start, end) = { + val v = split.getStart + if (v == 0) { + is.seek(24) + (100L, 2 * is.readInt().toLong) + } else { + (v, v + split.getLength) + } + } + + is.seek(start) dis = new DataInputStream(is) - require(is.readInt() == 9994) - // skip the next 20 bytes which should all be zero - 0 until 5 foreach {_ => require(is.readInt() == 0)} - // file length in bits - val i: BigInt = is.readInt() - length = 16 * i - 50 * 16 - remaining = length - val version = EndianUtils.swapInteger(is.readInt()) - require(version == 1000) - // shape type: all the shapes in a given split have the same type - val shapeType = EndianUtils.swapInteger(is.readInt()) key.setFileNamePrefix(split.getPath.getName.split("\\.")(0)) - value = new ShapeWritable(shapeType) - // skip the next 64 bytes - 0 until 8 foreach {_ => is.readDouble()} + value = new ShapeWritable() + remaining = (end - start) } override def getCurrentKey: ShapeKey = key diff --git a/src/main/scala/magellan/mapreduce/ShxInputFormat.scala b/src/main/scala/magellan/mapreduce/ShxInputFormat.scala new file mode 100644 index 0000000..75641dd --- /dev/null +++ b/src/main/scala/magellan/mapreduce/ShxInputFormat.scala @@ -0,0 +1,130 @@ +/** + * Copyright 2015 Ram Sriharsha + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package magellan.mapreduce + +import java.io.DataInputStream + +import org.apache.commons.io.EndianUtils +import org.apache.hadoop.fs.Path +import org.apache.hadoop.io._ +import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit} +import org.apache.hadoop.mapreduce.{InputSplit, JobContext, RecordReader, TaskAttemptContext} + +import scala.collection.mutable.ListBuffer + +class ShxInputFormat extends FileInputFormat[Text, ArrayWritable] { + + override def createRecordReader( + split: InputSplit, + context: TaskAttemptContext): RecordReader[Text, ArrayWritable] = { + new ShxReader() + } + + override def isSplitable(context: JobContext, filename: Path): Boolean = false +} + +class ShxReader extends RecordReader[Text, ArrayWritable] { + + private var dis: DataInputStream = _ + + override def getProgress: Float = ??? + + private var done: Boolean = false + + private var splits:ArrayWritable = _ + + private var key: Text = new Text() + + private val MAX_SPLIT_SIZE = "mapreduce.input.fileinputformat.split.maxsize" + + private val MIN_SPLIT_SIZE = "mapreduce.input.fileinputformat.split.minsize" + + + override def nextKeyValue(): Boolean = if (done) false else { + done = true + true + } + + override def getCurrentValue: ArrayWritable = { + splits + } + + override def initialize(inputSplit: InputSplit, context: TaskAttemptContext): Unit = { + val split = inputSplit.asInstanceOf[FileSplit] + val job = MapReduceUtils.getConfigurationFromContext(context) + val start = split.getStart() + val end = start + split.getLength() + val path = split.getPath() + val fs = path.getFileSystem(job) + key.set(split.getPath.getName.split("\\.")(0)) + val is = fs.open(path) + dis = new DataInputStream(is) + require(is.readInt() == 9994) + // skip the next 20 bytes which should all be zero + 0 until 5 foreach {_ => require(is.readInt() == 0)} + // file length in bits + val len = is.readInt() + val numRecords = (2 * len - 100) / 8 + + val version = EndianUtils.swapInteger(is.readInt()) + require(version == 1000) + // shape type: all the shapes in a given split have the same type + is.readInt() + + // skip the next 64 bytes + 0 until 8 foreach {_ => is.readDouble()} + + // iterate over the offset and content length of each record + var j = 0 + val minSplitSize = job.getLong(MIN_SPLIT_SIZE, 1L) + val maxSplitSize = job.getLong(MAX_SPLIT_SIZE, Long.MaxValue) + val shpFileName = path.getName.replace("\\.shx$", "\\.shp") + val blockSize = fs.getFileStatus(new Path(path.getParent, shpFileName)).getBlockSize + val splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize)) + + // num bytes + val v = new ListBuffer[Writable]() + + var startOffset: Long = Long.MinValue + + while (j < numRecords) { + val offset = dis.readInt() + // skip the next 4 bytes (the content length) + dis.readInt() + + if (startOffset == Long.MinValue) { + startOffset = offset + } + else if (offset - startOffset > splitSize) { + v.+= (new LongWritable(startOffset * 2)) + startOffset = offset + } + j += 1 + } + + // if empty add starting offset + if (v.isEmpty) { + v.+= (new LongWritable(startOffset * 2)) + } + + splits = new ArrayWritable(classOf[LongWritable], v.toArray) + } + + override def getCurrentKey: Text = key + + override def close() {} +} diff --git a/src/test/resources/shapefiles/us_states/tl_2016_us_state.cpg b/src/test/resources/shapefiles/us_states/tl_2016_us_state.cpg new file mode 100755 index 0000000..3ad133c --- /dev/null +++ b/src/test/resources/shapefiles/us_states/tl_2016_us_state.cpg @@ -0,0 +1 @@ +UTF-8 \ No newline at end of file diff --git a/src/test/resources/shapefiles/us_states/tl_2016_us_state.dbf b/src/test/resources/shapefiles/us_states/tl_2016_us_state.dbf new file mode 100755 index 0000000..2853c97 Binary files /dev/null and b/src/test/resources/shapefiles/us_states/tl_2016_us_state.dbf differ diff --git a/src/test/resources/shapefiles/us_states/tl_2016_us_state.prj b/src/test/resources/shapefiles/us_states/tl_2016_us_state.prj new file mode 100755 index 0000000..747df58 --- /dev/null +++ b/src/test/resources/shapefiles/us_states/tl_2016_us_state.prj @@ -0,0 +1 @@ +GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137,298.257222101]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]] \ No newline at end of file diff --git a/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp new file mode 100755 index 0000000..ae0e8b5 Binary files /dev/null and b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp differ diff --git a/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp.ea.iso.xml b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp.ea.iso.xml new file mode 100755 index 0000000..c490af9 --- /dev/null +++ b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp.ea.iso.xml @@ -0,0 +1,601 @@ + + + + Feature Catalog for the Current State and Equivalent National + + + The Current State and Equivalent National contains attributes for the primary governmental divisions of the United States. In addition to the fifty States, the Census Bureau treats the District of Columbia, Puerto Rico, and each of the Island Areas (American Samoa, the Commonwealth of the Northern Mariana Islands, Guam, and the U.S. Virgin Islands) as the statistical equivalents of States for the purpose of data presentation. + + + 2016 + + + 2016-06-01 + + + eng + + + utf8 + + + + + + + STATE.shp + + + Current State and Equivalent National entities + + + false + + + + + + REGION + + + Current Census region code + + + + + + + 1 + + + Northeast + + + + + + + + 2 + + + Midwest + + + + + + + + 3 + + + South + + + + + + + + 4 + + + West + + + + + + + + 9 + + + Puerto Rico and the Island Areas + + + + + + + + + + DIVISION + + + Current Census division code + + + + + + + 0 + + + Puerto Rico and the Island Areas + + + + + + + + 1 + + + New England + + + + + + + + 2 + + + Middle Atlantic + + + + + + + + 3 + + + East North Central + + + + + + + + 4 + + + West North Central + + + + + + + + 5 + + + South Atlantic + + + + + + + + 6 + + + East South Central + + + + + + + + 7 + + + West South Central + + + + + + + + 8 + + + Mountain + + + + + + + + 9 + + + Pacific + + + + + + + + + + STATEFP + + + Current state Federal Information Processing Series (FIPS) code + + + + + + + National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States/State Equivalents + + + + + + + + + + STATENS + + + Current state GNIS code + + + + + + + INCITS 446:2008 (Geographic Names Information System (GNIS)), Identifying Attributes for Named Physical and Cultural Geographic Features (Except Roads and Highways) of the United States, Its Territories, Outlying Areas, and Freely Associated Areas, and the Waters of the Same to the Limit of the Twelve-Mile Statutory Zone + + + + + + + + + + + + + U.S. Geological Survey (USGS) + + + resourceProvider + + + + + + + + + + + + + + + + + + GEOID + + + State identifier; state FIPS code + + + + + + + National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing + + + + + + + + + + STUSPS + + + Current United States Postal Service state abbreviation + + + + + + + + + + + + + + U.S. Postal Service + + + resourceProvider + + + + + + + + + + + + + + Publication 28 - Postal Addressing Standards + + + + + + + + + + + + + U.S. Postal Service + + + resourceProvider + + + + + + + + + + + + + + + + + + NAME + + + Current state name + + + + + + + National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States/State Equivalents + + + + + + + + + + LSAD + + + Current legal/statistical area description code for state + + + + + + + 00 + + + Blank + + + + + + + + + + MTFCC + + + MAF/TIGER feature class code + + + + + + + G4000 + + + State or Equivalent feature + + + + + + + + + + FUNCSTAT + + + Current functional status + + + + + + + A + + + Active government providing primary general-purpose functions + + + + + + + + + + ALAND + + + Current land area (square meters) + + + + + + + + Range Domain Minimum: 0 + Range Domain Maximum: 9,999,999,999,999 + + + + + + + + + AWATER + + + Current water area (square meters) + + + + + + + + Range Domain Minimum: 0 + Range Domain Maximum: 9,999,999,999,999 + + + + + + + + + INTPTLAT + + + Current latitude of the internal point + + + + + + + + Range Domain Minimum: -90.000000 + Range Domain Maximum: 90.000000 + + + + + + + + + INTPTLON + + + Current longitude of the internal point + + + + + + + + Range Domain Minimum: -180.000000 + Range Domain Maximum: 180.000000 + + + + + + + + \ No newline at end of file diff --git a/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp.iso.xml b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp.iso.xml new file mode 100755 index 0000000..552b55b --- /dev/null +++ b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp.iso.xml @@ -0,0 +1,581 @@ + + + + tl_2016_us_state.shp.iso.xml + + + eng + + + 8859part1 + + + +dataset + + + + + 2016-06-01 + + + ISO 19115 Geographic Information - Metadata + + + 2009-02-15 + + + http://www2.census.gov/geo/tiger/TIGER2016/STATE/tl_2016_us_state.zip + + + + + + + + + complex + + + 56 + + + + + + + + + + + + + Federal Information Processing Series (FIPS), Geographic Names Information System (GNIS), and feature names. + + + + + + + + + + + + TIGER/Line Shapefile, 2016, nation, U.S., Current State and Equivalent National + + + + + + 2016 + + + publication + + + + + + 2016 + + + + + + + + + The TIGER/Line shapefiles and related database files (.dbf) are an extract of selected geographic and cartographic information from the U.S. Census Bureau's Master Address File / Topologically Integrated Geographic Encoding and Referencing (MAF/TIGER) Database (MTDB). The MTDB represents a seamless national file with no overlaps or gaps between parts, however, each TIGER/Line shapefile is designed to stand alone as an independent data set, or they can be combined to cover the entire nation. + + +States and equivalent entities are the primary governmental divisions of the United States. In addition to the fifty States, the Census Bureau treats the District of Columbia, Puerto Rico, and each of the Island Areas (American Samoa, the Commonwealth of the Northern Mariana Islands, Guam, and the U.S. Virgin Islands) as the statistical equivalents of States for the purpose of data presentation. + + + + In order for others to use the information in the Census MAF/TIGER database in a geographic information system (GIS) or for other geographic applications, the Census Bureau releases to the public extracts of the database in the form of TIGER/Line Shapefiles. + + + + completed + + + + + + + + notPlanned + + + + + + + + http://tigerweb.geo.census.gov/arcgis/services/TIGERweb/tigerWMS_Current/MapServer/WmsServer?REQUEST=GetMap&SERVICE=WMS&VERSION=1.3.0&LAYERS=States,States Labels&STYLES=default,default&FORMAT=image/svg+xml&BGCOLOR=0xFFFFFF&TRANSPARENT=TRUE&CRS=EPSG:4326&BBOX=32.860571,-113.5097542,46.389131,-113.509754&WIDTH=891&HEIGHT=751 + + + URL for the TigerWeb Web Mapping Service + + + URL for the TigerWeb Web Mapping Service + + + + + + + NGDA + + + Governmental Units and Administrative and Statistical Boundaries Theme + + + National Geospatial Data Asset + + + theme + + + + + NGDA Portfolio Themes + + + + + + + + + + Nation + + + Polygon + + + theme + + + + + None + + + + + + + + + + +United States + + + + +U.S. + + + + place + + + + + ANSI INCITS 38:2009 (Formerly FIPS 5-2), + ANSI INCITS 31:2009 (Formerly FIPS 6-4),ANSI + INCITS 454:2009 (Formerly FIPS 8-6), ANSI INCITS + 455:2009(Formerly FIPS 9-1), ANSI INCITS 446:2008 (Geographic Names Information System (GNIS)) + + + + + + + + + + + + + otherRestrictions + + + + + + Access Constraints: None + + + Use Constraints:The TIGER/Line Shapefile products are not copyrighted however TIGER/Line and Census TIGER are registered trademarks of the U.S. Census Bureau. These products are free to use in a product or publication, however acknowledgement must be given to the U.S. Census Bureau as the source. +The boundary information in the TIGER/Line Shapefiles are for statistical data collection and tabulation purposes only; their depiction and designation for statistical purposes does not constitute a determination of jurisdictional authority or rights of ownership or entitlement and they are not legal land descriptions.Coordinates in the TIGER/Line shapefiles have six implied decimal places, but the positional accuracy of these coordinates is not as great as the six decimal places suggest. + + + + + + vector + + + eng + + + 8859part1 + + + boundaries + + + The TIGER/Line shapefiles contain geographic data only and do not include display mapping software or statistical data. For information on how to use the TIGER/Line shapefile data with specific software package users shall contact the company that produced the software. + + + + + + + -179.231086 + + + 179.859681 + + + -14.601813 + + + 71.441059 + + + + + + + + Publication Date + 2015-06 + 2016-05 + + + + + + + + + + + + + true + + + Current State and Equivalent + + + + + Feature Catalog for the 2016 TIGER/Line Shapefile Current State and Equivalent National + + + + + 2016 + + + + + + + + + http://meta.geo.census.gov/data/existing/decennial/GEO/GPMB/TIGERline/TIGER2016/state/tl_2016_state.shp.ea.iso.xml + + + + + + + + + + + TGRSHP (compressed) + + + + PK-ZIP, version 1.93 A or higher + + + + + + + html + + + + + + + + WMS + + + 1.3.0 + + + + + + + REST + + + + + + + + + + + The online copy of the TIGER/Line Shapefiles may be accessed without charge. + + + To obtain more information about ordering TIGER/Line Shapefiles visit http://www.census.gov/geo/www/tiger + + + + + + + + + + + http://www2.census.gov/geo/tiger/TIGER2016/STATE/tl_2016_us_state.zip + + + Shapefile Zip File + + + + + + + + + + + http://www.census.gov/geo/maps-data/data/tiger-line.html + + + TIGER/LineĀ® Shapefiles + + + Should be used for most mapping projects--this is our most comprehensive dataset. Designed for use with GIS (geographic information systems). + + + + + + + + + + + http://tigerweb.geo.census.gov/arcgis/services/TIGERweb/tigerWMS_Current/MapServer/WmsServer?REQUEST=GetMap&SERVICE=WMS&VERSION=1.3.0 &LAYERS=States,States Labels&STYLES=default,default&FORMAT=image/svg+xml&BGCOLOR=0xFFFFFF&TRANSPARENT=TRUE&CRS=EPSG:4326&BBOX=41.187053,-72.508142,42.88679,-69.858861&WIDTH=891&HEIGHT=751 + + + http://opengis.net/spec/wms + + + TIGERweb/tigerWMS_Current (MapServer) + + + This web mapping service contains the layer for the States and Equivalents. Since this layer displays at scales of 1:36,978,595 or greater, adjusting the BBox coordinates in the URL is necessary + + + download + + + + + + + + + + + + https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/State_County/MapServer + + + http://www.geoplatform.gov/spec/esri-map-rest + + + TIGERweb/State_County (MapServer) + + + This Rest Service contains the States and Equivalents Layers + + + download + + + + + + + + + + + + + + dataset + + + + + + + + + + + + Data completeness of the TIGER/Line Shapefiles reflects the contents of the Census MAF/TIGER database at the time the TIGER/Line Shapefiles were created. + + + + + + + + The Census Bureau performed automated tests to ensure logical consistency and limits of shapefiles. Segments making up the outer and inner boundaries of a polygon tie end-to-end to completely enclose the area. All polygons are tested for closure. +The Census Bureau uses its internally developed geographic update system to enhance and modify spatial and attribute data in the Census MAF/TIGER database. Standard geographic codes, such as FIPS codes for states, counties, municipalities, county subdivisions, places, American Indian/Alaska Native/Native Hawaiian areas, and congressional districts are used when encoding spatial entities. The Census Bureau performed spatial data tests for logical consistency of the codes during the compilation of the original Census MAF/TIGER database files. Most of the codes for geographic entities except states, counties, urban areas, Core Based Statistical Areas (CBSAs), American Indian Areas (AIAs), and congressional districts were provided to the Census Bureau by the USGS, the agency responsible for maintaining the Geographic Names Information System (GNIS). Feature attribute information has been examined but has not been fully tested for consistency. +For the TIGER/Line Shapefiles, the Point and Vector Object Count for the G-polygon SDTS Point and Vector Object Type reflects the number of records in the shapefile attribute table. For multi-polygon features, only one attribute record exists for each multi-polygon rather than one attribute record per individual G-polygon component of the multi-polygon feature. TIGER/Line Shapefile multi-polygons are an exception to the G-polygon object type classification. Therefore, when multi-polygons exist in a shapefile, the object count will be less than the actual number of G-polygons. + + + + + + + + + + TIGER/Line Shapefiles are extracted from the Census MAF/TIGER database by nation, state, county, and entity. Census MAF/TIGER data for all of the aforementioned geographic entities are then distributed among the shapefiles each containing attributes for line, polygon, or landmark geographic data. + + + 2016-01-01T00:00:00 + + + + + online + + + + + Census MAF/TIGER database + + + MAF/TIGER + + + + + 201605 + + + Publication Date + + + + + + + U.S. Department of Commerce, U.S. Census Bureau, Geography Division + + + originator + + + + + + Source Contribution: All line segments + + + + + + + + + + + + + + + + notPlanned + + + + This was transformed from the Census Metadata Import Format + + + + + \ No newline at end of file diff --git a/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp.xml b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp.xml new file mode 100755 index 0000000..97ea225 --- /dev/null +++ b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shp.xml @@ -0,0 +1,482 @@ + + + + + + U.S. Department of Commerce, U.S. Census Bureau, Geography + Division + 2016 + TIGER/Line Shapefile, 2016, nation, U.S., Current State and Equivalent National + 2016 + vector digital data + http://www2.census.gov/geo/tiger/TIGER2016/STATE/tl_2016_us_state.zip + + + + The TIGER/Line shapefiles and related database files (.dbf) are an extract of selected geographic and cartographic information from the U.S. Census Bureau's Master Address File / Topologically Integrated Geographic Encoding and Referencing (MAF/TIGER) Database (MTDB). The MTDB represents a seamless national file with no overlaps or gaps between parts, however, each TIGER/Line shapefile is designed to stand alone as an independent data set, or they can be combined to cover the entire nation. + + +States and equivalent entities are the primary governmental divisions of the United States. In addition to the fifty States, the Census Bureau treats the District of Columbia, Puerto Rico, and each of the Island Areas (American Samoa, the Commonwealth of the Northern Mariana Islands, Guam, and the U.S. Virgin Islands) as the statistical equivalents of States for the purpose of data presentation. + + In order for others to use the information in the Census MAF/TIGER database in a geographic information system (GIS) or for other geographic applications, the Census Bureau releases to the public extracts of the database in the form of TIGER/Line Shapefiles. + + + + + 201506 + 201605 + + + Publication Date + + + Complete + No changes or updates will be made to this version of the TIGER/Line Shapefiles. Future releases of TIGER/Line Shapefiles will reflect updates made to the Census MAF/TIGER database. + + + + -179.231086 + 179.859681 + 71.441059 + -14.601813 + + + + + NGDA Portfolio Themes + NGDA + Governmental Units and Administrative and Statistical Boundaries Theme + National Geospatial Data Asset + + + None + Nation + Polygon + + + ISO 19115 Topic Categories + Boundaries + + + ANSI INCITS 38:2009 (Formerly FIPS 5-2), + ANSI INCITS 31:2009 (Formerly FIPS 6-4),ANSI + INCITS 454:2009 (Formerly FIPS 8-6), ANSI INCITS + 455:2009(Formerly FIPS 9-1), ANSI INCITS 446:2008 (Geographic Names Information System (GNIS)) + + +United States + + +U.S. + + + + None + The TIGER/Line Shapefile products are not copyrighted however TIGER/Line and Census TIGER are registered trademarks of the U.S. Census Bureau. These products are free to use in a product or publication, however acknowledgement must be given to the U.S. Census Bureau as the source. +The boundary information in the TIGER/Line Shapefiles are for statistical data collection and tabulation purposes only; their depiction and designation for statistical purposes does not constitute a determination of jurisdictional authority or rights of ownership or entitlement and they are not legal land descriptions.Coordinates in the TIGER/Line shapefiles have six implied decimal places, but the positional accuracy of these coordinates is not as great as the six decimal places suggest. + + + + U.S. Department of Commerce, U.S. Census Bureau, + Geography Division, Spatial Data Collection and Products Branch + + + mailing +
4600 Silver Hill Road, Stop 7400
+ Washington + DC + 20233-7400 + United States +
+ 301-763-1128 + 301-763-4710 + geo.geography@census.gov +
+
+
+ + + Accurate against National Standard Codes, Federal Information Processing (FIPS) and the Geographic Names Information System (GNIS) at the 100% level for the codes and base names. The remaining attribute information has been examined but has not been fully tested for accuracy. + + The Census Bureau performed automated tests to ensure logical consistency and limits of shapefiles. Segments making up the outer and inner boundaries of a polygon tie end-to-end to completely enclose the area. All polygons are tested for closure. +The Census Bureau uses its internally developed geographic update system to enhance and modify spatial and attribute data in the Census MAF/TIGER database. Standard geographic codes, such as FIPS codes for states, counties, municipalities, county subdivisions, places, American Indian/Alaska Native/Native Hawaiian areas, and congressional districts are used when encoding spatial entities. The Census Bureau performed spatial data tests for logical consistency of the codes during the compilation of the original Census MAF/TIGER database files. Most of the codes for geographic entities except states, counties, urban areas, Core Based Statistical Areas (CBSAs), American Indian Areas (AIAs), and congressional districts were provided to the Census Bureau by the USGS, the agency responsible for maintaining the Geographic Names Information System (GNIS). Feature attribute information has been examined but has not been fully tested for consistency. +For the TIGER/Line Shapefiles, the Point and Vector Object Count for the G-polygon SDTS Point and Vector Object Type reflects the number of records in the shapefile attribute table. For multi-polygon features, only one attribute record exists for each multi-polygon rather than one attribute record per individual G-polygon component of the multi-polygon feature. TIGER/Line Shapefile multi-polygons are an exception to the G-polygon object type classification. Therefore, when multi-polygons exist in a shapefile, the object count will be less than the actual number of G-polygons. + Data completeness of the TIGER/Line Shapefiles reflects the contents of the Census MAF/TIGER database at the time the TIGER/Line Shapefiles were created. + + + + + U.S. Department of Commerce, U.S. Census Bureau, Geography Division + Unpublished material + Census MAF/TIGER database + + + online + + + + 201506 + 201605 + + + Publication Date + + MAF/TIGER + All line segments + + + TIGER/Line Shapefiles are extracted from the Census MAF/TIGER database by nation, state, county, and entity. Census MAF/TIGER data for all of the aforementioned geographic entities are then distributed among the shapefiles each containing attributes for line, polygon, or landmark geographic data. + MAF/TIGER + 2016 + + + + + Federal Information Processing Series (FIPS), Geographic Names Information System (GNIS), and feature names. + Vector + + + G-polygon + 56 + + + + + + + 0.000458 + 0.000458 + Decimal degrees + + + North American Datum of 1983 + Geodetic Reference System 80 + 6378137 + 298.257 + + + + + + + STATE.shp + Current State and Equivalent National entities + U.S. Census Bureau + + + REGION + Current Census region code + U.S. Census Bureau + + + 1 + Northeast + U.S. Census Bureau + + + + + 2 + Midwest + U.S. Census Bureau + + + + + 3 + South + U.S. Census Bureau + + + + + 4 + West + U.S. Census Bureau + + + + + 9 + Puerto Rico and the Island Areas + U.S. Census Bureau + + + + + DIVISION + Current Census division code + U.S. Census Bureau + + + 0 + Puerto Rico and the Island Areas + U.S. Census Bureau + + + + + 1 + New England + U.S. Census Bureau + + + + + 2 + Middle Atlantic + U.S. Census Bureau + + + + + 3 + East North Central + U.S. Census Bureau + + + + + 4 + West North Central + U.S. Census Bureau + + + + + 5 + South Atlantic + U.S. Census Bureau + + + + + 6 + East South Central + U.S. Census Bureau + + + + + 7 + West South Central + U.S. Census Bureau + + + + + 8 + Mountain + U.S. Census Bureau + + + + + 9 + Pacific + U.S. Census Bureau + + + + + STATEFP + Current state Federal Information Processing Series (FIPS) code + U.S. Census Bureau + + + National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States/State Equivalents + U.S. Census Bureau + + + + + STATENS + Current state GNIS code + U.S. Census Bureau + + + INCITS 446:2008 (Geographic Names Information System (GNIS)), Identifying Attributes for Named Physical and Cultural Geographic Features (Except Roads and Highways) of the United States, Its Territories, Outlying Areas, and Freely Associated Areas, and the Waters of the Same to the Limit of the Twelve-Mile Statutory Zone + U.S. Geological Survey (USGS) + + + + + GEOID + State identifier; state FIPS code + U.S. Census Bureau + + + National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing + U.S. Census Bureau + + + + + STUSPS + Current United States Postal Service state abbreviation + U.S. Postal Service + + + Publication 28 - Postal Addressing Standards + U.S. Postal Service + + + + + NAME + Current state name + U.S. Census Bureau + + + National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States/State Equivalents + U.S. Census Bureau + + + + + LSAD + Current legal/statistical area description code for state + U.S. Census Bureau + + + 00 + Blank + U.S. Census Bureau + + + + + MTFCC + MAF/TIGER feature class code + U.S. Census Bureau + + + G4000 + State or Equivalent feature + U.S. Census Bureau + + + + + FUNCSTAT + Current functional status + U.S. Census Bureau + + + A + Active government providing primary general-purpose functions + U.S. Census Bureau + + + + + ALAND + Current land area (square meters) + U.S. Census Bureau + + + 0 + 9,999,999,999,999 + + + + + AWATER + Current water area (square meters) + U.S. Census Bureau + + + 0 + 9,999,999,999,999 + + + + + INTPTLAT + Current latitude of the internal point + U.S. Census Bureau + + + -90.000000 + 90.000000 + + + + + INTPTLON + Current longitude of the internal point + U.S. Census Bureau + + + -180.000000 + 180.000000 + + + + + + + + + + U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Spatial Data Collection and Products Branch + + + mailing +
4600 Silver Hill Road, Stop 7400
+ Washington + DC + 20233-7400 + United States +
+ 301-763-1128 + 301-763-4710 + geo.geography@census.gov +
+
+ No warranty, expressed or implied is made with regard to the accuracy of these data, and no liability is assumed by the U.S. Government in general or the U.S. Census Bureau in specific as to the spatial or attribute accuracy of the data. The act of distribution shall not constitute any such warranty and no responsibility is assumed by the U.S. government in the use of these files. The boundary information in the TIGER/Line Shapefiles is for statistical data collection and tabulation purposes only; their depiction and designation for statistical purposes do not constitute a determination of jurisdictional authority or rights of ownership or entitlement and they are not legal land descriptions. + + + + TGRSHP (compressed) + PK-ZIP, version 1.93 A or higher + + + + + + http://www2.census.gov/geo/tiger/TIGER2016/STATE/tl_2016_us_state.zip + + + + + + The online copy of the TIGER/Line Shapefiles may be accessed without charge. + To obtain more information about ordering TIGER/Line Shapefiles visit http://www.census.gov/geo/www/tiger + + The TIGER/Line shapefiles contain geographic data only and do not include display mapping software or statistical data. For information on how to use the TIGER/Line shapefile data with specific software package users shall contact the company that produced the software. +
+ + 20160601 + + + + U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Spatial Data Collection and Products Branch + + + mailing +
4600 Silver Hill Road, Stop 7400
+ Washington + DC + 20233-7400 + United States +
+ 301-763-1128 + 301-763-4710 + geo.geography@census.gov +
+
+ FGDC Content Standards for Digital Geospatial Metadata + FGDC-STD-001-1998 +
+
\ No newline at end of file diff --git a/src/test/resources/shapefiles/us_states/tl_2016_us_state.shx b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shx new file mode 100755 index 0000000..ea62b07 Binary files /dev/null and b/src/test/resources/shapefiles/us_states/tl_2016_us_state.shx differ diff --git a/src/test/scala/magellan/ShapefileSuite.scala b/src/test/scala/magellan/ShapefileSuite.scala index 71a89b7..1955119 100644 --- a/src/test/scala/magellan/ShapefileSuite.scala +++ b/src/test/scala/magellan/ShapefileSuite.scala @@ -17,12 +17,17 @@ package magellan import magellan.TestingUtils._ -import magellan.index.ZOrderCurve +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.magellan.dsl.expressions._ -import org.scalatest.FunSuite +import org.scalatest.{BeforeAndAfterAll, FunSuite} -class ShapefileSuite extends FunSuite with TestSparkContext { +class ShapefileSuite extends FunSuite with TestSparkContext with BeforeAndAfterAll { + override def beforeAll() { + super.beforeAll() + sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.split.maxsize", "10000") + } test("shapefile-relation: points") { val sqlCtx = this.sqlContext @@ -105,9 +110,16 @@ class ShapefileSuite extends FunSuite with TestSparkContext { option("magellan.index", "true"). option("magellan.index.precision", "15"). load(path) - import sqlCtx.implicits._ import org.apache.spark.sql.functions.explode + import sqlCtx.implicits._ assert(df.select(explode($"index")).count() === 2) assert(df.select(explode($"index").as("index")).groupBy($"index.relation").count().count() === 1) } + + test("shapefile-relation: use shx file to split") { + val sqlCtx = this.sqlContext + val path = this.getClass.getClassLoader.getResource("shapefiles/us_states/").getPath + val df = sqlCtx.read.format("magellan").load(path) + assert(df.count() === 56) + } } diff --git a/src/test/scala/magellan/mapreduce/ShxReaderSuite.scala b/src/test/scala/magellan/mapreduce/ShxReaderSuite.scala new file mode 100644 index 0000000..16419ab --- /dev/null +++ b/src/test/scala/magellan/mapreduce/ShxReaderSuite.scala @@ -0,0 +1,92 @@ +/** + * Copyright 2015 Ram Sriharsha + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package magellan.mapreduce + +import magellan.TestSparkContext +import magellan.io.PolygonReader +import org.apache.commons.io.EndianUtils +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.io.{ArrayWritable, LongWritable, Text} +import org.scalatest.FunSuite + +class ShxReaderSuite extends FunSuite with TestSparkContext { + + test("Read shx file") { + val path = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shx").getPath + val conf = new Configuration() + conf.set("mapreduce.input.fileinputformat.split.maxsize", "10000") + + val data = sc.newAPIHadoopFile( + path, + classOf[ShxInputFormat], + classOf[Text], + classOf[ArrayWritable], + conf + ).map { case (txt: Text, splits: ArrayWritable) => + val fileName = txt.toString + val s = splits.get() + val size = s.length + var i = 0 + val v = Array.fill(size)(0L) + while (i < size) { + v.update(i, s(i).asInstanceOf[LongWritable].get()) + i += 1 + } + (fileName, v) + } + assert(data.count() === 1) + val (fileName, splits) = data.first() + assert(fileName === "tl_2016_us_state") + + // the offsets should be correct + val firstOffset = splits(0) + val secondOffset = splits(1) + + // skipping to the first offset in the Shapefile should allow me to read the first polygon + val shpFilePath = this.getClass.getClassLoader.getResource("shapefiles/us_states/tl_2016_us_state.shp").getPath + + val fs = FileSystem.get(sc.hadoopConfiguration) + + var dis = fs.open(new Path(shpFilePath)) + + // skip firstOffset # of bytes + dis.seek(firstOffset) + + // skip record number + assert(dis.readInt() === 1) + + // read content length + var contentLength = 16 * (dis.readInt() + 4) + + // extract the shape type + var shapeType = EndianUtils.swapInteger(dis.readInt()) + + // expect a Polygon + assert(shapeType === 5) + + // the first polygon's content should follow from here + val polygonReader = new PolygonReader() + val polygon = polygonReader.readFields(dis) + assert(polygon != null) + + // seek to the second offset + dis.seek(secondOffset) + assert(dis.readInt() === 2) + + } +}