From 9e357cca1c66566a8446d9b2f1692025e5494777 Mon Sep 17 00:00:00 2001 From: Nick Ruest Date: Sat, 18 Jan 2020 14:26:58 -0500 Subject: [PATCH] Add crawl_date to binary DataFrames and imageLinks. (#414) - Resolves #413 - Update tests where necessary --- .../scala/io/archivesunleashed/package.scala | 48 +++++++++++-------- .../df/DataFrameLoaderTest.scala | 5 +- 2 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index 90d982ac..fff79edd 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -363,7 +363,7 @@ package object archivesunleashed { val records = rdd .keepValidPages() .flatMap(r => ExtractLinksRDD(r.getUrl, r.getContentString) - .map(t => (r.getCrawlDate, t._1, t._2, t._3))) + .map(t => (r.getCrawlDate, t._1, t._2, t._3))) .filter(t => t._2 != "" && t._3 != "") .map(t => Row(t._1, t._2, t._3, t._4)) @@ -381,14 +381,16 @@ package object archivesunleashed { def imageLinks(): DataFrame = { val records = rdd .keepValidPages() - .flatMap(r => { + .flatMap(r => ({ val src = r.getUrl val imageUrls = ExtractImageLinksRDD(src, r.getContentString) imageUrls.map(url => (src, url)) }) - .map(t => Row(t._1, t._2)) + .map(t => (r.getCrawlDate, t._1, t._2))) + .map(t => Row(t._1, t._2, t._3)) val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) .add(StructField("src", StringType, true)) .add(StructField("image_url", StringType, true)) @@ -406,12 +408,13 @@ package object archivesunleashed { val url = new URL(r.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMimeRDD(url.getPath(), mimeTypeTika) - (r.getUrl, filename, extension, r.getMimeType, mimeTypeTika, + (r.getCrawlDate, r.getUrl, filename, extension, r.getMimeType, mimeTypeTika, image.width, image.height, image.md5Hash, image.sha1Hash, image.body) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10)) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10, t._11)) val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -442,12 +445,13 @@ package object archivesunleashed { val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMimeRDD(url.getPath(), r._2) - (r._1.getUrl, filename, extension, r._1.getMimeType, + (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -476,12 +480,13 @@ package object archivesunleashed { val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMimeRDD(url.getPath(), r._2) - (r._1.getUrl, filename, extension, r._1.getMimeType, + (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -510,12 +515,13 @@ package object archivesunleashed { val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMimeRDD(url.getPath(), r._2) - (r._1.getUrl, filename, extension, r._1.getMimeType, + (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -577,12 +583,13 @@ package object archivesunleashed { } } val extension = GetExtensionMimeRDD(url.getPath(), mimeType) - (r._1.getUrl, filename, extension, r._1.getMimeType, + (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -623,12 +630,13 @@ package object archivesunleashed { val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMimeRDD(url.getPath(), r._2) - (r._1.getUrl, filename, extension, r._1.getMimeType, + (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -674,12 +682,13 @@ package object archivesunleashed { val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMimeRDD(url.getPath(), r._2) - (r._1.getUrl, filename, extension, r._1.getMimeType, + (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -714,12 +723,13 @@ package object archivesunleashed { val url = new URL(r.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = FilenameUtils.getExtension(url.getPath()) - (r.getUrl, filename, extension, r.getMimeType, + (r.getCrawlDate, r.getUrl, filename, extension, r.getMimeType, DetectMimeTypeTika(r.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8)) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala index fea9db6a..3ca90546 100644 --- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala @@ -68,8 +68,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { assert(r_2(1) == "Advanced Search") val r_3 = imageLinks.take(100)(99) - assert(r_3.get(0) == "http://www.archive.org/details/secretarmiesb00spivrich") - assert(r_3.get(1) == "http://www.archive.org/images/star.png") + assert(r_3.get(0) == "20080430") + assert(r_3.get(1) == "http://www.archive.org/details/secretarmiesb00spivrich") + assert(r_3.get(2) == "http://www.archive.org/images/star.png") val r_4 = images.take(1)(0) assert(r_4.getAs[String](url) == "http://www.archive.org/images/logoc.jpg")