\n")
+ outFile.write(
+ "\n" +
+ "\n" +
+ "\n" +
+ "\n" +
+ " \n" +
+ "\n" +
+ "\n"
+ )
vertices foreach { v =>
- outFile.write("\n\n")
data foreach { e =>
- outFile.write("\n" +
- "\n" +
- "\n" +
- "\n")
+ outFile.write(
+ "\n" +
+ "\n" +
+ "\n" +
+ "\n"
+ )
}
outFile.write("\n\n")
outFile.close()
diff --git a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala
index 8686ed65..a4cd2d22 100644
--- a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala
+++ b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala
@@ -21,29 +21,31 @@ import java.nio.file.{Files, Paths}
import org.apache.spark.sql.Row
object WriteGraphML {
+
/** Verifies graphmlPath is empty.
- *
- * @param data Array[Row] elements in format (crawl_date, src_domain,
- * dest_domain, count)
- * @param graphmlPath output file
- */
+ *
+ * @param data Array[Row] elements in format (crawl_date, src_domain,
+ * dest_domain, count)
+ * @param graphmlPath output file
+ */
def apply(data: Array[Row], graphmlPath: String): Boolean = {
if (graphmlPath.isEmpty()) {
false
} else {
- makeFile (data, graphmlPath)
+ makeFile(data, graphmlPath)
}
}
/** Produces the GraphML output from an Array[Row] and outputs it to graphmlPath.
- *
- * @param data a Dataset[Row] of elements in format (crawl_date, src_domain,
- * dest_domain, count)
- * @param graphmlPath output file
- * @return true on success.
- */
+ *
+ * @param data a Dataset[Row] of elements in format (crawl_date, src_domain,
+ * dest_domain, count)
+ * @param graphmlPath output file
+ * @return true on success.
+ */
def makeFile(data: Array[Row], graphmlPath: String): Boolean = {
- val outFile = Files.newBufferedWriter(Paths.get(graphmlPath), StandardCharsets.UTF_8)
+ val outFile =
+ Files.newBufferedWriter(Paths.get(graphmlPath), StandardCharsets.UTF_8)
val nodes = scala.collection.mutable.Set[String]()
data foreach { d =>
@@ -51,32 +53,46 @@ object WriteGraphML {
nodes.add(d.get(2).asInstanceOf[String])
}
- outFile.write("\n" +
- "" +
- "\n" +
- "\n" +
- "0.0\n" +
- "\n" +
- "\n" +
- "\n")
+ outFile.write(
+ "\n" +
+ "" +
+ "\n" +
+ "\n" +
+ "0.0\n" +
+ "\n" +
+ "\n" +
+ "\n"
+ )
nodes foreach { n =>
- outFile.write("\n" +
- "" + n.asInstanceOf[String].escapeInvalidXML() + "\n\n")
+ outFile.write(
+ "\n" +
+ "" + n
+ .asInstanceOf[String]
+ .escapeInvalidXML() + "\n\n"
+ )
}
data foreach { e =>
- outFile.write("\n" +
- "" + e.get(3) + "\n" +
- "" + e.get(0) + "\n" +
- "\n")
+ outFile.write(
+ "\n" +
+ "" + e.get(3) + "\n" +
+ "" + e.get(0) + "\n" +
+ "\n"
+ )
}
- outFile.write("\n" +
- "")
+ outFile.write(
+ "\n" +
+ ""
+ )
outFile.close()
true
}
diff --git a/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala b/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala
index 53e44bb9..57cdaa15 100644
--- a/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala
+++ b/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala
@@ -20,73 +20,72 @@ import io.archivesunleashed.RecordLoader
import org.apache.spark.SparkContext
import org.apache.spark.sql.DataFrame
-/** DataFrame wrapper for PySpark implementation. **/
+/** DataFrame wrapper for PySpark implementation. */
class DataFrameLoader(sc: SparkContext) {
/** Create a DataFrame with crawl_date, url, mime_type_web_server, mime_type_tika, content, bytes, http_status_code, and archive_filename. */
def all(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
+ RecordLoader
+ .loadArchives(path, sc)
.keepValidPages()
.all()
}
/** Create a DataFrame with audio url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def audio(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
- .audio
+ RecordLoader.loadArchives(path, sc).audio
}
/* Create a DataFrame with crawl date, source page, image url, and alt text. */
def imagegraph(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
+ RecordLoader
+ .loadArchives(path, sc)
.imagegraph()
}
/** Create a DataFrame with image url, filename, extension, mime_type_web_server, mime_type_tika, width, height, md5, sha1, and raw bytes. */
def images(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
+ RecordLoader
+ .loadArchives(path, sc)
.images()
}
/** Create a DataFrame with PDF url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def pdfs(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
- .pdfs
+ RecordLoader.loadArchives(path, sc).pdfs
}
/** Create a DataFrame with presentation program file url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def presentationProgramFiles(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
- .presentationProgramFiles
+ RecordLoader.loadArchives(path, sc).presentationProgramFiles
}
/** Create a DataFrame with spreadsheet url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def spreadsheets(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
- .spreadsheets
+ RecordLoader.loadArchives(path, sc).spreadsheets
}
/** Create a DataFrame with video url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def videos(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
- .videos
+ RecordLoader.loadArchives(path, sc).videos
}
/** Create a DataFrame with crawl_date, source, destination, and anchor. */
def webgraph(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
+ RecordLoader
+ .loadArchives(path, sc)
.webgraph()
}
/** Create a DataFrame with crawl_date, url, mime_type_web_server, language, and content. */
def webpages(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
+ RecordLoader
+ .loadArchives(path, sc)
.webpages()
}
/** Create a DataFrame with word processor file url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */
def wordProcessorFiles(path: String): DataFrame = {
- RecordLoader.loadArchives(path, sc)
- .wordProcessorFiles
+ RecordLoader.loadArchives(path, sc).wordProcessorFiles
}
- }
+}
diff --git a/src/main/scala/io/archivesunleashed/df/package.scala b/src/main/scala/io/archivesunleashed/df/package.scala
index 677711d8..727b442d 100644
--- a/src/main/scala/io/archivesunleashed/df/package.scala
+++ b/src/main/scala/io/archivesunleashed/df/package.scala
@@ -25,9 +25,9 @@ import org.apache.spark.sql.DataFrame
package object df {
/**
- * Given a dataframe, serializes binary object and saves to disk
- * @param df the input dataframe
- */
+ * Given a dataframe, serializes binary object and saves to disk
+ * @param df the input dataframe
+ */
implicit class SaveBytes(df: DataFrame) {
/**
@@ -36,24 +36,30 @@ package object df {
* @param extensionColumnName the name of the column containin the extension
* e.g. fileName = "foo" => files are saved as "foo-[MD5 hash].pdf"
*/
- def saveToDisk(bytesColumnName: String, fileName: String, extensionColumnName: String): Unit = {
- df.select(bytesColumnName, extensionColumnName).foreach(row => {
- try {
- // Assumes the bytes are base64 encoded.
- val encodedBytes: String = row.getAs(bytesColumnName);
- val bytes = Base64.getDecoder.decode(encodedBytes);
- val in = new ByteArrayInputStream(bytes);
+ def saveToDisk(
+ bytesColumnName: String,
+ fileName: String,
+ extensionColumnName: String
+ ): Unit = {
+ df.select(bytesColumnName, extensionColumnName)
+ .foreach(row => {
+ try {
+ // Assumes the bytes are base64 encoded.
+ val encodedBytes: String = row.getAs(bytesColumnName);
+ val bytes = Base64.getDecoder.decode(encodedBytes);
+ val in = new ByteArrayInputStream(bytes);
- val extension: String = row.getAs(extensionColumnName);
- val suffix = ComputeMD5(bytes)
- val file = new FileOutputStream(fileName + "-" + suffix + "." + extension.toLowerCase)
- IOUtils.copy(in, file)
- file.close()
- } catch {
- case e: Throwable => {
+ val extension: String = row.getAs(extensionColumnName);
+ val suffix = ComputeMD5(bytes)
+ val file = new FileOutputStream(
+ fileName + "-" + suffix + "." + extension.toLowerCase
+ )
+ IOUtils.copy(in, file)
+ file.close()
+ } catch {
+ case e: Throwable => {}
}
- }
- })
+ })
}
}
}
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala
index 15e2a94b..bbf8d8eb 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala
@@ -22,13 +22,13 @@ import javax.imageio.ImageIO
object ComputeImageSize {
/** Computes image size from a byte array using ImageIO.
- *
- * Used by `ExtractPopularImages` to calculate the size of
- * the image as a tuple of integers (width, height).
- *
- * @param bytes image as a byte array
- * @return size of image as a tuple (width, height) or (0,0).
- */
+ *
+ * Used by `ExtractPopularImages` to calculate the size of
+ * the image as a tuple of integers (width, height).
+ *
+ * @param bytes image as a byte array
+ * @return size of image as a tuple (width, height) or (0,0).
+ */
def apply(bytes: Array[Byte]): (Int, Int) = {
val nullImage = (0, 0)
try {
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala
index ded60bb1..fda08e79 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala
@@ -19,12 +19,17 @@ import java.security.MessageDigest
/** Compute MD5 checksum. */
object ComputeMD5 {
+
/** Computes the MD5 checksum of a byte array (eg. an image).
*
* @param bytes
* @return MD5 checksum.
*/
def apply(bytes: Array[Byte]): String = {
- MessageDigest.getInstance("MD5").digest(bytes).map("%02x".format(_)).mkString
+ MessageDigest
+ .getInstance("MD5")
+ .digest(bytes)
+ .map("%02x".format(_))
+ .mkString
}
}
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala
index bf454971..2ec165ba 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala
@@ -27,6 +27,10 @@ object ComputeSHA1 {
* @return SHA1 checksum.
*/
def apply(bytes: Array[Byte]): String = {
- MessageDigest.getInstance("SHA1").digest(bytes).map("%02x".format(_)).mkString
+ MessageDigest
+ .getInstance("SHA1")
+ .digest(bytes)
+ .map("%02x".format(_))
+ .mkString
}
}
diff --git a/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala
index 3934513f..da4c14de 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala
@@ -23,16 +23,16 @@ import org.apache.tika.language.detect.LanguageResult;
object DetectLanguage {
/** Detects the language of a String input.
- *
- * @param input the string for which language can be detected
- * @return ISO 639-2 language code (eg. "en", "fr" or "it").
- */
+ *
+ * @param input the string for which language can be detected
+ * @return ISO 639-1 language code (e.g. "en", "fr" or "it").
+ */
def apply(input: String): String = {
if (input.isEmpty) {
""
} else {
val detector: LanguageDetector = new OptimaizeLangDetector().loadModels()
- val result : LanguageResult = detector.detect(input)
+ val result: LanguageResult = detector.detect(input)
result.getLanguage()
}
}
diff --git a/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala b/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala
index aff45496..06c47365 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala
@@ -33,10 +33,10 @@ object DetectMimeTypeTika {
val allMimeTypes = MimeTypes.getDefaultMimeTypes();
/** Detect MIME type from an input string.
- *
- * @param content a byte array of content for which to detect the MimeType
- * @return MIME type (e.g. "text/html" or "application/xml") or "N/A".
- */
+ *
+ * @param content a byte array of content for which to detect the MimeType
+ * @return MIME type (e.g. "text/html" or "application/xml") or "N/A".
+ */
def apply(content: Array[Byte]): String = {
if (content.size == 0) {
"N/A"
@@ -49,20 +49,20 @@ object DetectMimeTypeTika {
}
/** Return the best guess at a file extension from a MIME type string
- *
- * @param mimeType string representation of the MimeType
- * @return file extension (e.g. ".jpg" for "image/jpeg").
- */
+ *
+ * @param mimeType string representation of the MimeType
+ * @return file extension (e.g. ".jpg" for "image/jpeg").
+ */
def getExtension(mimeType: String): String = {
val regMimeType = allMimeTypes.forName(mimeType)
regMimeType.getExtension
}
/** Return the list of all known file extensions for a MIME type string
- *
- * @param mimeType string representation of the MimeType
- * @return list of file extensions (e.g. ".jpg" for "image/jpeg").
- */
+ *
+ * @param mimeType string representation of the MimeType
+ * @return list of file extensions (e.g. ".jpg" for "image/jpeg").
+ */
def getExtensions(mimeType: String): List[String] = {
val regMimeType = allMimeTypes.forName(mimeType)
regMimeType.getExtensions.asScala.toList
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala
index 2924a102..77b681d1 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala
@@ -18,23 +18,28 @@ package io.archivesunleashed.matchbox
import de.l3s.boilerpipe.extractors.DefaultExtractor
import java.io.IOException
-/** Extract raw text content from an HTML page, minus "boilerplate" content (using boilerpipe). */
+/** Extract raw text content from an HTML page, minus "boilerplate" content (using boilerpipe). */
object ExtractBoilerpipeText {
+
/** Uses boilerpipe to extract raw text content from a page.
- *
- * ExtractBoilerpipeText removes boilerplate text (e.g. a copyright statement) from an HTML string.
- *
- * @param input an html string possibly containing boilerpipe text
- * @return text with boilerplate removed or Nil if the text is empty.
- */
+ *
+ * ExtractBoilerpipeText removes boilerplate text (e.g. a copyright statement) from an HTML string.
+ *
+ * @param input an html string possibly containing boilerpipe text
+ * @return text with boilerplate removed or Nil if the text is empty.
+ */
def apply(input: String): String = {
removeBoilerplate(RemoveHTTPHeader(input))
}
private def removeBoilerplate(input: String): String = {
- val maybeInput = Option(DefaultExtractor.INSTANCE
- .getText(input).replaceAll("[\\r\\n]+", " ").trim())
+ val maybeInput = Option(
+ DefaultExtractor.INSTANCE
+ .getText(input)
+ .replaceAll("[\\r\\n]+", " ")
+ .trim()
+ )
maybeInput match {
case Some(text) =>
text
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
index ac697478..4dc44efb 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala
@@ -18,6 +18,7 @@ package io.archivesunleashed.matchbox
/** Gets different parts of a dateString. */
object ExtractDate {
object DateComponent extends Enumeration {
+
/** An enum specifying years, months, days or a combination. */
type DateComponent = Value
val YYYY, MM, DD, YYYYMM, YYYYMMDD = Value
@@ -39,11 +40,11 @@ object ExtractDate {
maybeFullDate match {
case Some(fulldate) =>
dateFormat match {
- case YYYY => fullDate.substring(startSS, yearSS)
- case MM => fullDate.substring(yearSS, monthSS)
- case DD => fullDate.substring(monthSS, daySS)
+ case YYYY => fullDate.substring(startSS, yearSS)
+ case MM => fullDate.substring(yearSS, monthSS)
+ case DD => fullDate.substring(monthSS, daySS)
case YYYYMM => fullDate.substring(startSS, monthSS)
- case _ => fullDate.substring(startSS, daySS)
+ case _ => fullDate.substring(startSS, daySS)
}
case None =>
""
@@ -64,11 +65,11 @@ object ExtractDate {
maybeFullDate match {
case Some(fulldate) =>
dateFormat match {
- case "YYYY" => fullDate.substring(startSS, yearSS)
- case "MM" => fullDate.substring(yearSS, monthSS)
- case "DD" => fullDate.substring(monthSS, daySS)
+ case "YYYY" => fullDate.substring(startSS, yearSS)
+ case "MM" => fullDate.substring(yearSS, monthSS)
+ case "DD" => fullDate.substring(monthSS, daySS)
case "YYYYMM" => fullDate.substring(startSS, monthSS)
- case _ => fullDate.substring(startSS, daySS)
+ case _ => fullDate.substring(startSS, daySS)
}
case None =>
""
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
index c0aef0cd..c609c837 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
@@ -19,12 +19,13 @@ import java.net.URL
/** Extracts the host domain name from a full url string. */
object ExtractDomain {
+
/** Extract source domains from a full url string.
- *
- * @param url a url as a string
- * @param source an optional default url for urls with no valid domain host
- * @return domain host, source or null if url is null.
- */
+ *
+ * @param url a url as a string
+ * @param source an optional default url for urls with no valid domain host
+ * @return domain host, source or null if url is null.
+ */
def apply(url: String, source: String = ""): String = {
val maybeHost: Option[URL] = checkUrl(url)
val maybeSource: Option[URL] = checkUrl(source)
@@ -38,7 +39,7 @@ object ExtractDomain {
source.getHost
case None =>
""
- }
+ }
}
}
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala
index b735c848..3bd05222 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala
@@ -26,8 +26,12 @@ class ImageDetails(imageUrl: String, imageType: String, bytes: Array[Byte]) {
val height = dimensions._2
val url: String = imageUrl
val mimeType: String = imageType
- val md5Hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
- val sha1Hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
+ val md5Hash: String = new String(
+ Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))
+ )
+ val sha1Hash: String = new String(
+ Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))
+ )
val body: String = Base64.getEncoder.encodeToString(bytes)
}
@@ -35,9 +39,9 @@ class ImageDetails(imageUrl: String, imageType: String, bytes: Array[Byte]) {
object ExtractImageDetails {
/**
- * @param bytes the raw bytes of the image
- * @return A tuple containing the width and height of the image
- */
+ * @param bytes the raw bytes of the image
+ * @return an ImageDetails built from the given url, MIME type and bytes
+ */
def apply(url: String, mimeType: String, bytes: Array[Byte]): ImageDetails = {
new ImageDetails(url, mimeType, bytes)
}
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala
index e96be01a..015e088b 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala
@@ -31,14 +31,18 @@ object ExtractLinks {
* @param base an optional base URI
* @return a sequence of (source, target, anchortext).
*/
- def apply(src: String, html: String, base: String = ""): Seq[(String, String, String)] = {
+ def apply(
+ src: String,
+ html: String,
+ base: String = ""
+ ): Seq[(String, String, String)] = {
val srcMaybe: Option[String] = Option(src)
val htmlMaybe: Option[String] = Option(html)
val output = mutable.MutableList[(String, String, String)]()
srcMaybe match {
case Some(valid_src) =>
htmlMaybe match {
- case Some (valid_html) =>
+ case Some(valid_html) =>
val doc = Jsoup.parse(valid_html)
val links: Elements = doc.select("a[href]")
val it = links.iterator()
@@ -51,11 +55,11 @@ object ExtractLinks {
}
}
case None =>
- // do nothing
- }
+ // do nothing
+ }
case None =>
- // do nothing
- }
+ // do nothing
+ }
output
}
}
diff --git a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala
index 3903e17d..49f061d0 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala
@@ -22,10 +22,10 @@ import org.jsoup.Jsoup
object RemoveHTML {
/** Removes HTML markup.
- *
- * @param content an html or text string
- * @return content without html markup.
- */
+ *
+ * @param content an html or text string
+ * @return content without html markup.
+ */
def apply(content: String): String = {
// First remove the HTTP header.
val maybeContent: Option[String] = Option(RemoveHTTPHeader(content))
diff --git a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala
index e074698d..80b54b7e 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala
@@ -20,15 +20,15 @@ object RemoveHTTPHeader {
val headerEnd = "\r\n\r\n"
/** Remove HTTP headers.
- *
- * @param content string of WARC or ARC-based text content
- * @return string with HTTP headers removed.
- */
+ *
+ * @param content string of WARC or ARC-based text content
+ * @return string with HTTP headers removed.
+ */
def apply(content: String): String = {
val maybeContent: Option[String] = Option(content)
maybeContent match {
case Some(content) =>
- if (content.startsWith("HTTP/")){
+ if (content.startsWith("HTTP/")) {
content.substring(content.indexOf(headerEnd) + headerEnd.length)
} else {
content
diff --git a/src/main/scala/io/archivesunleashed/matchbox/package.scala b/src/main/scala/io/archivesunleashed/matchbox/package.scala
index ca0b8bf4..fe954517 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/package.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/package.scala
@@ -20,7 +20,6 @@ import java.io.IOException
import java.security.MessageDigest
import scala.xml.Utility.escape
-
/** Package object which supplies implicits providing common UDF-related functionalities. */
package object matchbox {
implicit class WWWLink(s: String) {
@@ -28,16 +27,16 @@ package object matchbox {
val maybeString: Option[String] = Option(s)
maybeString match {
case Some(s) => s.replaceAll("^\\s*www\\.", "")
- case None => ""
+ case None => ""
}
}
def escapeInvalidXML(): String = {
try {
escape(s)
- }
- catch {
- case e: Exception => throw new IOException("Caught exception processing input row ", e)
+ } catch {
+ case e: Exception =>
+ throw new IOException("Caught exception processing input row ", e)
}
}
}
diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala
index eadd14c3..3e6a4d8c 100644
--- a/src/main/scala/io/archivesunleashed/package.scala
+++ b/src/main/scala/io/archivesunleashed/package.scala
@@ -20,14 +20,32 @@ import java.security.MessageDigest
import java.util.Base64
import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat
-import io.archivesunleashed.data.{ArchiveRecordInputFormat, ArchiveRecordWritable}
+import io.archivesunleashed.data.{
+ ArchiveRecordInputFormat,
+ ArchiveRecordWritable
+}
import ArchiveRecordWritable.ArchiveFormat
-import io.archivesunleashed.udfs.{detectLanguage, detectMimeTypeTika, extractDate, extractDomain, removeHTML}
+import io.archivesunleashed.udfs.{
+ detectLanguage,
+ detectMimeTypeTika,
+ extractDate,
+ extractDomain,
+ removeHTML
+}
-import io.archivesunleashed.matchbox.{DetectLanguage, DetectMimeTypeTika, ExtractDate,
- ExtractDomain, ExtractImageDetails, ExtractImageLinks,
- ExtractLinks, GetExtensionMIME, RemoveHTML, RemoveHTTPHeader}
+import io.archivesunleashed.matchbox.{
+ DetectLanguage,
+ DetectMimeTypeTika,
+ ExtractDate,
+ ExtractDomain,
+ ExtractImageDetails,
+ ExtractImageLinks,
+ ExtractLinks,
+ GetExtensionMIME,
+ RemoveHTML,
+ RemoveHTTPHeader
+}
import io.archivesunleashed.matchbox.ExtractDate.DateComponent
import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent
import java.net.URI
@@ -38,7 +56,13 @@ import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.LongWritable
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.{lit, udf}
-import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, StructField, StructType}
+import org.apache.spark.sql.types.{
+ BinaryType,
+ IntegerType,
+ StringType,
+ StructField,
+ StructType
+}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{RangePartitioner, SerializableWritable, SparkContext}
import scala.reflect.ClassTag
@@ -49,8 +73,10 @@ import scala.util.Try
* Package object which supplies implicits to augment generic RDDs with AUT-specific transformations.
*/
package object archivesunleashed {
+
/** Loads records from either WARCs or ARCs. */
object RecordLoader {
+
/** Gets all non-empty archive files.
*
* @param dir the path to the directory containing archive files
@@ -59,7 +85,9 @@ package object archivesunleashed {
*/
def getFiles(dir: Path, fs: FileSystem): String = {
val statuses = fs.globStatus(dir)
- val files = statuses.filter(f => fs.getContentSummary(f.getPath).getLength > 0).map(f => f.getPath)
+ val files = statuses
+ .filter(f => fs.getContentSummary(f.getPath).getLength > 0)
+ .map(f => f.getPath)
files.mkString(",")
}
@@ -73,17 +101,26 @@ package object archivesunleashed {
val uri = new URI(path)
val fs = FileSystem.get(uri, sc.hadoopConfiguration)
val p = new Path(path)
- sc.newAPIHadoopFile(getFiles(p, fs), classOf[ArchiveRecordInputFormat], classOf[LongWritable], classOf[ArchiveRecordWritable])
- .filter(r => (r._2.getFormat == ArchiveFormat.ARC) ||
- ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response")))
- .map(r => new ArchiveRecordImpl(new SerializableWritable(r._2)))
+ sc.newAPIHadoopFile(
+ getFiles(p, fs),
+ classOf[ArchiveRecordInputFormat],
+ classOf[LongWritable],
+ classOf[ArchiveRecordWritable]
+ ).filter(r =>
+ (r._2.getFormat == ArchiveFormat.ARC) ||
+ ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader
+ .getHeaderValue("WARC-Type")
+ .equals("response"))
+ ).map(r => new ArchiveRecordImpl(new SerializableWritable(r._2)))
}
}
/** A Wrapper class around RDD to simplify counting. */
- implicit class CountableRDD[T: ClassTag](rdd: RDD[T]) extends java.io.Serializable {
+ implicit class CountableRDD[T: ClassTag](rdd: RDD[T])
+ extends java.io.Serializable {
def countItems(): RDD[(T, Int)] = {
- rdd.map(r => (r, 1))
+ rdd
+ .map(r => (r, 1))
.reduceByKey((c1, c2) => c1 + c2)
.sortBy(f => f._2, ascending = false)
}
@@ -104,13 +141,13 @@ package object archivesunleashed {
/** Removes all non-html-based data (images, executables, etc.) from html text. */
def keepValidPagesDF(): DataFrame = {
df.filter($"crawl_date" isNotNull)
- .filter(!($"url".rlike(".*robots\\.txt$")) &&
- ( $"mime_type_web_server".rlike("text/html") ||
- $"mime_type_web_server".rlike("application/xhtml+xml") ||
- $"url".rlike("(?i).*htm$") ||
- $"url".rlike("(?i).*html$")
- )
- )
+ .filter(
+ !($"url".rlike(".*robots\\.txt$")) &&
+ ($"mime_type_web_server".rlike("text/html") ||
+ $"mime_type_web_server".rlike("application/xhtml+xml") ||
+ $"url".rlike("(?i).*htm$") ||
+ $"url".rlike("(?i).*html$"))
+ )
.filter($"http_status_code" === 200)
}
}
@@ -120,14 +157,24 @@ package object archivesunleashed {
*
* To load such an RDD, please see [[RecordLoader]].
*/
- implicit class WARecordRDD(rdd: RDD[ArchiveRecord]) extends java.io.Serializable {
+ implicit class WARecordRDD(rdd: RDD[ArchiveRecord])
+ extends java.io.Serializable {
/* Creates a column for Bytes as well in Dataframe.
Call KeepImages OR KeepValidPages on RDD depending upon the requirement before calling this method */
def all(): DataFrame = {
- val records = rdd.map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType,
- DetectMimeTypeTika(r.getBinaryBytes), r.getContentString,
- r.getBinaryBytes, r.getHttpStatus, r.getArchiveFilename))
+ val records = rdd.map(r =>
+ Row(
+ r.getCrawlDate,
+ r.getUrl,
+ r.getMimeType,
+ DetectMimeTypeTika(r.getBinaryBytes),
+ r.getContentString,
+ r.getBinaryBytes,
+ r.getHttpStatus,
+ r.getArchiveFilename
+ )
+ )
val schema = new StructType()
.add(StructField("crawl_date", StringType, true))
@@ -148,20 +195,28 @@ package object archivesunleashed {
rdd.filter(r =>
r.getCrawlDate != null
&& (r.getMimeType == "text/html"
- || r.getMimeType == "application/xhtml+xml"
- || r.getUrl.toLowerCase.endsWith("htm")
- || r.getUrl.toLowerCase.endsWith("html"))
+ || r.getMimeType == "application/xhtml+xml"
+ || r.getUrl.toLowerCase.endsWith("htm")
+ || r.getUrl.toLowerCase.endsWith("html"))
&& !r.getUrl.toLowerCase.endsWith("robots.txt")
- && r.getHttpStatus == "200")
+ && r.getHttpStatus == "200"
+ )
}
/** Extracts webpages with columns for crawl data, url, MIME type, and content. */
def webpages(): DataFrame = {
- val records = rdd.keepValidPages()
- .map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType,
- DetectMimeTypeTika(r.getBinaryBytes),
- DetectLanguage(RemoveHTML(RemoveHTTPHeader(r.getContentString))),
- r.getContentString))
+ val records = rdd
+ .keepValidPages()
+ .map(r =>
+ Row(
+ r.getCrawlDate,
+ r.getUrl,
+ r.getMimeType,
+ DetectMimeTypeTika(r.getBinaryBytes),
+ DetectLanguage(RemoveHTML(RemoveHTTPHeader(r.getContentString))),
+ r.getContentString
+ )
+ )
val schema = new StructType()
.add(StructField("crawl_date", StringType, true))
@@ -179,8 +234,10 @@ package object archivesunleashed {
def webgraph(): DataFrame = {
val records = rdd
.keepValidPages()
- .flatMap(r => ExtractLinks(r.getUrl, r.getContentString)
- .map(t => (r.getCrawlDate, t._1, t._2, t._3)))
+ .flatMap(r =>
+ ExtractLinks(r.getUrl, r.getContentString)
+ .map(t => (r.getCrawlDate, t._1, t._2, t._3))
+ )
.filter(t => t._2 != "" && t._3 != "")
.map(t => Row(t._1, t._2, t._3, t._4))
@@ -198,8 +255,10 @@ package object archivesunleashed {
def imagegraph(): DataFrame = {
val records = rdd
.keepValidPages()
- .flatMap(r => ExtractImageLinks(r.getUrl, r.getContentString)
- .map(t => (r.getCrawlDate, t._1, t._2, t._3)))
+ .flatMap(r =>
+ ExtractImageLinks(r.getUrl, r.getContentString)
+ .map(t => (r.getCrawlDate, t._1, t._2, t._3))
+ )
.filter(t => t._2 != "" && t._3 != "")
.map(t => Row(t._1, t._2, t._3, t._4))
@@ -219,14 +278,40 @@ package object archivesunleashed {
.keepImages()
.map(r => {
val mimeTypeTika = DetectMimeTypeTika(r.getBinaryBytes)
- val image = ExtractImageDetails(r.getUrl, mimeTypeTika, r.getBinaryBytes)
+ val image =
+ ExtractImageDetails(r.getUrl, mimeTypeTika, r.getBinaryBytes)
val url = new URL(r.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMIME(url.getPath(), mimeTypeTika)
- (r.getCrawlDate, r.getUrl, filename, extension, r.getMimeType, mimeTypeTika,
- image.width, image.height, image.md5Hash, image.sha1Hash, image.body)
+ (
+ r.getCrawlDate,
+ r.getUrl,
+ filename,
+ extension,
+ r.getMimeType,
+ mimeTypeTika,
+ image.width,
+ image.height,
+ image.md5Hash,
+ image.sha1Hash,
+ image.body
+ )
})
- .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10, t._11))
+ .map(t =>
+ Row(
+ t._1,
+ t._2,
+ t._3,
+ t._4,
+ t._5,
+ t._6,
+ t._7,
+ t._8,
+ t._9,
+ t._10,
+ t._11
+ )
+ )
val schema = new StructType()
.add(StructField("crawl_date", StringType, true))
@@ -248,20 +333,31 @@ package object archivesunleashed {
/* Extract PDF bytes and PDF metadata. */
def pdfs(): DataFrame = {
val records = rdd
- .map(r =>
- (r, (DetectMimeTypeTika(r.getBinaryBytes)))
- )
+ .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes))))
.filter(r => r._2 == "application/pdf")
.map(r => {
val bytes = r._1.getBinaryBytes
- val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
- val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
+ val md5Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))
+ )
+ val sha1Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))
+ )
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMIME(url.getPath(), r._2)
- (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
- DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
+ (
+ r._1.getCrawlDate,
+ r._1.getUrl,
+ filename,
+ extension,
+ r._1.getMimeType,
+ DetectMimeTypeTika(r._1.getBinaryBytes),
+ md5Hash,
+ sha1Hash,
+ encodedBytes
+ )
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))
@@ -283,20 +379,31 @@ package object archivesunleashed {
/* Extract audio bytes and audio metadata. */
def audio(): DataFrame = {
val records = rdd
- .map(r =>
- (r, (DetectMimeTypeTika(r.getBinaryBytes)))
- )
+ .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes))))
.filter(r => r._2.startsWith("audio/"))
.map(r => {
val bytes = r._1.getBinaryBytes
- val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
- val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
+ val md5Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))
+ )
+ val sha1Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))
+ )
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMIME(url.getPath(), r._2)
- (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
- DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
+ (
+ r._1.getCrawlDate,
+ r._1.getUrl,
+ filename,
+ extension,
+ r._1.getMimeType,
+ DetectMimeTypeTika(r._1.getBinaryBytes),
+ md5Hash,
+ sha1Hash,
+ encodedBytes
+ )
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))
@@ -318,20 +425,31 @@ package object archivesunleashed {
/* Extract video bytes and video metadata. */
def videos(): DataFrame = {
val records = rdd
- .map(r =>
- (r, (DetectMimeTypeTika(r.getBinaryBytes)))
- )
+ .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes))))
.filter(r => r._2.startsWith("video/"))
.map(r => {
val bytes = r._1.getBinaryBytes
- val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
- val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
+ val md5Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))
+ )
+ val sha1Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))
+ )
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMIME(url.getPath(), r._2)
- (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
- DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
+ (
+ r._1.getCrawlDate,
+ r._1.getUrl,
+ filename,
+ extension,
+ r._1.getMimeType,
+ DetectMimeTypeTika(r._1.getBinaryBytes),
+ md5Hash,
+ sha1Hash,
+ encodedBytes
+ )
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))
@@ -353,39 +471,43 @@ package object archivesunleashed {
/* Extract spreadsheet bytes and spreadsheet metadata. */
def spreadsheets(): DataFrame = {
val records = rdd
- .map(r =>
- (r, (DetectMimeTypeTika(r.getBinaryBytes)))
- )
- .filter(r => (r._2 == "application/vnd.ms-excel"
- || r._2 == "application/vnd.ms-excel.workspace.3"
- || r._2 == "application/vnd.ms-excel.workspace.4"
- || r._2 == "application/vnd.ms-excel.sheet.2"
- || r._2 == "application/vnd.ms-excel.sheet.3"
- || r._2 == "application/vnd.ms-excel.sheet.3"
- || r._2 == "application/vnd.ms-excel.addin.macroenabled.12"
- || r._2 == "application/vnd.ms-excel.sheet.binary.macroenabled.12"
- || r._2 == "application/vnd.ms-excel.sheet.macroenabled.12"
- || r._2 == "application/vnd.ms-excel.template.macroenabled.12"
- || r._2 == "application/vnd.ms-spreadsheetml"
- || r._2 == "application/vnd.openxmlformats-officedocument.spreadsheetml.template"
- || r._2 == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
- || r._2 == "application/x-vnd.oasis.opendocument.spreadsheet-template"
- || r._2 == "application/vnd.oasis.opendocument.spreadsheet-template"
- || r._2 == "application/vnd.oasis.opendocument.spreadsheet"
- || r._2 == "application/x-vnd.oasis.opendocument.spreadsheet"
- || r._2 == "application/x-tika-msworks-spreadsheet"
- || r._2 == "application/vnd.lotus-1-2-3"
- || r._2 == "text/csv" // future versions of Tika?
- || r._2 == "text/tab-separated-values" // " "
- || r._1.getMimeType == "text/csv"
- || r._1.getMimeType == "text/tab-separated-values")
- || ((r._1.getUrl.toLowerCase.endsWith(".csv")
- || r._1.getUrl.toLowerCase.endsWith(".tsv"))
- && r._2 == "text/plain"))
+ .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes))))
+ .filter(r =>
+ (r._2 == "application/vnd.ms-excel"
+ || r._2 == "application/vnd.ms-excel.workspace.3"
+ || r._2 == "application/vnd.ms-excel.workspace.4"
+ || r._2 == "application/vnd.ms-excel.sheet.2"
+ || r._2 == "application/vnd.ms-excel.sheet.3"
+ || r._2 == "application/vnd.ms-excel.sheet.3"
+ || r._2 == "application/vnd.ms-excel.addin.macroenabled.12"
+ || r._2 == "application/vnd.ms-excel.sheet.binary.macroenabled.12"
+ || r._2 == "application/vnd.ms-excel.sheet.macroenabled.12"
+ || r._2 == "application/vnd.ms-excel.template.macroenabled.12"
+ || r._2 == "application/vnd.ms-spreadsheetml"
+ || r._2 == "application/vnd.openxmlformats-officedocument.spreadsheetml.template"
+ || r._2 == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+ || r._2 == "application/x-vnd.oasis.opendocument.spreadsheet-template"
+ || r._2 == "application/vnd.oasis.opendocument.spreadsheet-template"
+ || r._2 == "application/vnd.oasis.opendocument.spreadsheet"
+ || r._2 == "application/x-vnd.oasis.opendocument.spreadsheet"
+ || r._2 == "application/x-tika-msworks-spreadsheet"
+ || r._2 == "application/vnd.lotus-1-2-3"
+ || r._2 == "text/csv" // future versions of Tika?
+ || r._2 == "text/tab-separated-values" // " "
+ || r._1.getMimeType == "text/csv"
+ || r._1.getMimeType == "text/tab-separated-values")
+ || ((r._1.getUrl.toLowerCase.endsWith(".csv")
+ || r._1.getUrl.toLowerCase.endsWith(".tsv"))
+ && r._2 == "text/plain")
+ )
.map(r => {
val bytes = r._1.getBinaryBytes
- val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
- val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
+ val md5Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))
+ )
+ val sha1Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))
+ )
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
@@ -398,8 +520,17 @@ package object archivesunleashed {
}
}
val extension = GetExtensionMIME(url.getPath(), mimeType)
- (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
- DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
+ (
+ r._1.getCrawlDate,
+ r._1.getUrl,
+ filename,
+ extension,
+ r._1.getMimeType,
+ DetectMimeTypeTika(r._1.getBinaryBytes),
+ md5Hash,
+ sha1Hash,
+ encodedBytes
+ )
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))
@@ -421,32 +552,45 @@ package object archivesunleashed {
/* Extract presentation program bytes and presentation program metadata. */
def presentationProgramFiles(): DataFrame = {
val records = rdd
- .map(r =>
- (r, (DetectMimeTypeTika(r.getBinaryBytes)))
- )
- .filter(r => r._2 == "application/vnd.ms-powerpoint"
- || r._2 == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
- || r._2 == "application/vnd.oasis.opendocument.presentation"
- || r._2 == "application/vnd.oasis.opendocument.presentation-template"
- || r._2 == "application/vnd.sun.xml.impress"
- || r._2 == "application/vnd.sun.xml.impress.template"
- || r._2 == "application/vnd.stardivision.impress"
- || r._2 == "application/x-starimpress"
- || r._2 == "application/vnd.ms-powerpoint.addin.macroEnabled.12"
- || r._2 == "application/vnd.ms-powerpoint.presentation.macroEnabled.12"
- || r._2 == "application/vnd.ms-powerpoint.slide.macroEnabled.12"
- || r._2 == "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"
- || r._2 == "application/vnd.ms-powerpoint.template.macroEnabled.12")
+ .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes))))
+ .filter(r =>
+ r._2 == "application/vnd.ms-powerpoint"
+ || r._2 == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+ || r._2 == "application/vnd.oasis.opendocument.presentation"
+ || r._2 == "application/vnd.oasis.opendocument.presentation-template"
+ || r._2 == "application/vnd.sun.xml.impress"
+ || r._2 == "application/vnd.sun.xml.impress.template"
+ || r._2 == "application/vnd.stardivision.impress"
+ || r._2 == "application/x-starimpress"
+ || r._2 == "application/vnd.ms-powerpoint.addin.macroEnabled.12"
+ || r._2 == "application/vnd.ms-powerpoint.presentation.macroEnabled.12"
+ || r._2 == "application/vnd.ms-powerpoint.slide.macroEnabled.12"
+ || r._2 == "application/vnd.ms-powerpoint.slideshow.macroEnabled.12"
+ || r._2 == "application/vnd.ms-powerpoint.template.macroEnabled.12"
+ )
.map(r => {
val bytes = r._1.getBinaryBytes
- val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
- val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
+ val md5Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))
+ )
+ val sha1Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))
+ )
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMIME(url.getPath(), r._2)
- (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
- DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
+ (
+ r._1.getCrawlDate,
+ r._1.getUrl,
+ filename,
+ extension,
+ r._1.getMimeType,
+ DetectMimeTypeTika(r._1.getBinaryBytes),
+ md5Hash,
+ sha1Hash,
+ encodedBytes
+ )
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))
@@ -468,37 +612,50 @@ package object archivesunleashed {
/* Extract word processor bytes and word processor metadata. */
def wordProcessorFiles(): DataFrame = {
val records = rdd
- .map(r =>
- (r, (DetectMimeTypeTika(r.getBinaryBytes)))
- )
- .filter(r => r._2 == "application/vnd.lotus-wordpro"
- || r._2 == "application/vnd.kde.kword"
- || r._2 == "application/vnd.ms-word.document.macroEnabled.12"
- || r._2 == "application/vnd.ms-word.template.macroEnabled.12"
- || r._2 == "application/vnd.oasis.opendocument.text"
- || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml"
- || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
- || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml"
- || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"
- || r._2 == "application/vnd.wordperfect"
- || r._2 == "application/wordperfect5.1"
- || r._2 == "application/msword"
- || r._2 == "application/vnd.ms-word.document.macroEnabled.12"
- || r._2 == "application/vnd.ms-word.template.macroEnabled.12"
- || r._2 == "application/vnd.apple.pages"
- || r._2 == "application/macwriteii"
- || r._2 == "application/vnd.ms-works"
- || r._2 == "application/rtf")
+ .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes))))
+ .filter(r =>
+ r._2 == "application/vnd.lotus-wordpro"
+ || r._2 == "application/vnd.kde.kword"
+ || r._2 == "application/vnd.ms-word.document.macroEnabled.12"
+ || r._2 == "application/vnd.ms-word.template.macroEnabled.12"
+ || r._2 == "application/vnd.oasis.opendocument.text"
+ || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml"
+ || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+ || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml"
+ || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"
+ || r._2 == "application/vnd.wordperfect"
+ || r._2 == "application/wordperfect5.1"
+ || r._2 == "application/msword"
+ || r._2 == "application/vnd.ms-word.document.macroEnabled.12"
+ || r._2 == "application/vnd.ms-word.template.macroEnabled.12"
+ || r._2 == "application/vnd.apple.pages"
+ || r._2 == "application/macwriteii"
+ || r._2 == "application/vnd.ms-works"
+ || r._2 == "application/rtf"
+ )
.map(r => {
val bytes = r._1.getBinaryBytes
- val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)))
- val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)))
+ val md5Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))
+ )
+ val sha1Hash = new String(
+ Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))
+ )
val encodedBytes = Base64.getEncoder.encodeToString(bytes)
val url = new URL(r._1.getUrl)
val filename = FilenameUtils.getName(url.getPath())
val extension = GetExtensionMIME(url.getPath(), r._2)
- (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType,
- DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes)
+ (
+ r._1.getCrawlDate,
+ r._1.getUrl,
+ filename,
+ extension,
+ r._1.getMimeType,
+ DetectMimeTypeTika(r._1.getBinaryBytes),
+ md5Hash,
+ sha1Hash,
+ encodedBytes
+ )
})
.map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9))
@@ -519,8 +676,10 @@ package object archivesunleashed {
/** Removes all data except images. */
def keepImages(): RDD[ArchiveRecord] = {
- rdd.filter(r => r.getCrawlDate != null
- && DetectMimeTypeTika(r.getBinaryBytes).startsWith("image/"))
+ rdd.filter(r =>
+ r.getCrawlDate != null
+ && DetectMimeTypeTika(r.getBinaryBytes).startsWith("image/")
+ )
}
/** Removes all data but selected mimeTypes specified.
@@ -540,9 +699,9 @@ package object archivesunleashed {
}
/** Removes all data that does not have selected HTTP status codes.
- *
- * @param statusCodes a list of HTTP status codes
- */
+ *
+ * @param statusCodes a list of HTTP status codes
+ */
def keepHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord] = {
rdd.filter(r => statusCodes.contains(r.getHttpStatus))
}
@@ -552,7 +711,10 @@ package object archivesunleashed {
* @param dates a list of dates
* @param component the selected DateComponent enum value
*/
- def keepDate(dates: List[String], component: DateComponent = DateComponent.YYYYMMDD): RDD[ArchiveRecord] = {
+ def keepDate(
+ dates: List[String],
+ component: DateComponent = DateComponent.YYYYMMDD
+ ): RDD[ArchiveRecord] = {
rdd.filter(r => dates.contains(ExtractDate(r.getCrawlDate, component)))
}
@@ -570,11 +732,15 @@ package object archivesunleashed {
*/
def keepUrlPatterns(urlREs: Set[Regex]): RDD[ArchiveRecord] = {
rdd.filter(r =>
- urlREs.map(re =>
- r.getUrl match {
- case re() => true
- case _ => false
- }).exists(identity))
+ urlREs
+ .map(re =>
+ r.getUrl match {
+ case re() => true
+ case _ => false
+ }
+ )
+ .exists(identity)
+ )
}
/** Removes all data but selected source domains.
@@ -582,7 +748,9 @@ package object archivesunleashed {
* @param urls a list of urls for the source domains
*/
def keepDomains(urls: Set[String]): RDD[ArchiveRecord] = {
- rdd.filter(r => urls.contains(ExtractDomain(r.getUrl).replace("^\\s*www\\.", "")))
+ rdd.filter(r =>
+ urls.contains(ExtractDomain(r.getUrl).replace("^\\s*www\\.", ""))
+ )
}
/** Removes all data not in selected language.
@@ -590,7 +758,9 @@ package object archivesunleashed {
* @param lang a set of ISO 639-2 codes
*/
def keepLanguages(lang: Set[String]): RDD[ArchiveRecord] = {
- rdd.filter(r => lang.contains(DetectLanguage(RemoveHTML(r.getContentString))))
+ rdd.filter(r =>
+ lang.contains(DetectLanguage(RemoveHTML(r.getContentString)))
+ )
}
/** Removes all content that does not pass Regular Expression test.
@@ -599,11 +769,15 @@ package object archivesunleashed {
*/
def keepContent(contentREs: Set[Regex]): RDD[ArchiveRecord] = {
rdd.filter(r =>
- contentREs.map(re =>
- (re findFirstIn r.getContentString) match {
- case Some(v) => true
- case None => false
- }).exists(identity))
+ contentREs
+ .map(re =>
+ (re findFirstIn r.getContentString) match {
+ case Some(v) => true
+ case None => false
+ }
+ )
+ .exists(identity)
+ )
}
/** Filters ArchiveRecord MimeTypes (web server).
@@ -647,16 +821,20 @@ package object archivesunleashed {
}
/** Filters detected URL patterns (regex).
- *
- * @param urlREs a list of Regular expressions
- */
+ *
+ * @param urlREs a list of Regular expressions
+ */
def discardUrlPatterns(urlREs: Set[Regex]): RDD[ArchiveRecord] = {
rdd.filter(r =>
- !urlREs.map(re =>
- r.getUrl match {
- case re() => true
- case _ => false
- }).exists(identity))
+ !urlREs
+ .map(re =>
+ r.getUrl match {
+ case re() => true
+ case _ => false
+ }
+ )
+ .exists(identity)
+ )
}
/** Filters detected domains (regex).
@@ -673,11 +851,15 @@ package object archivesunleashed {
*/
def discardContent(contentREs: Set[Regex]): RDD[ArchiveRecord] = {
rdd.filter(r =>
- !contentREs.map(re =>
- (re findFirstIn r.getContentString) match {
- case Some(v) => true
- case None => false
- }).exists(identity))
+ !contentREs
+ .map(re =>
+ (re findFirstIn r.getContentString) match {
+ case Some(v) => true
+ case None => false
+ }
+ )
+ .exists(identity)
+ )
}
/** Filters detected language.
@@ -685,7 +867,9 @@ package object archivesunleashed {
* @param lang a set of ISO 639-2 codes
*/
def discardLanguages(lang: Set[String]): RDD[ArchiveRecord] = {
- rdd.filter(r => !lang.contains(DetectLanguage(RemoveHTML(r.getContentString))))
+ rdd.filter(r =>
+ !lang.contains(DetectLanguage(RemoveHTML(r.getContentString)))
+ )
}
}
}
diff --git a/src/main/scala/io/archivesunleashed/udfs/package.scala b/src/main/scala/io/archivesunleashed/udfs/package.scala
index db6ddef8..30e6000e 100644
--- a/src/main/scala/io/archivesunleashed/udfs/package.scala
+++ b/src/main/scala/io/archivesunleashed/udfs/package.scala
@@ -16,57 +16,103 @@
package io.archivesunleashed
-import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5, ComputeSHA1,
- DetectLanguage, DetectMimeTypeTika,
- ExtractBoilerpipeText, ExtractDate,
- ExtractDomain, ExtractImageLinks, ExtractLinks,
- GetExtensionMIME, RemoveHTML, RemoveHTTPHeader}
+import io.archivesunleashed.matchbox.{
+ ComputeImageSize,
+ ComputeMD5,
+ ComputeSHA1,
+ DetectLanguage,
+ DetectMimeTypeTika,
+ ExtractBoilerpipeText,
+ ExtractDate,
+ ExtractDomain,
+ ExtractImageLinks,
+ ExtractLinks,
+ GetExtensionMIME,
+ RemoveHTML,
+ RemoveHTTPHeader
+}
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.SparkSession
import scala.util.matching.Regex
-/** Package object providing UDFs for DataFrames in Scala and PySpark. **/
+/** Package object providing UDFs for DataFrames in Scala and PySpark. * */
package object udfs extends Serializable {
// Matchbox
- def computeImageSize: UserDefinedFunction = udf(ComputeImageSize.apply(_: Array[Byte]))
+ def computeImageSize: UserDefinedFunction =
+ udf(ComputeImageSize.apply(_: Array[Byte]))
def computeMD5: UserDefinedFunction = udf(ComputeMD5.apply(_: Array[Byte]))
def computeSHA1: UserDefinedFunction = udf(ComputeSHA1.apply(_: Array[Byte]))
def detectLanguage: UserDefinedFunction = udf(DetectLanguage.apply(_: String))
- def detectMimeTypeTika: UserDefinedFunction = udf(DetectMimeTypeTika.apply(_: Array[Byte]))
- def extractBoilerpipeText: UserDefinedFunction = udf(ExtractBoilerpipeText.apply(_: String))
- def extractDate: UserDefinedFunction = udf(ExtractDate.apply(_: String, _: String))
- def extractDomain: UserDefinedFunction = udf(ExtractDomain.apply(_: String, ""))
- def extractImageLinks: UserDefinedFunction = udf(ExtractImageLinks.apply(_: String, _: String))
- def extractLinks: UserDefinedFunction = udf(ExtractLinks.apply(_: String, _: String))
- def getExtensionMime: UserDefinedFunction = udf(GetExtensionMIME.apply(_: String, _: String))
+ def detectMimeTypeTika: UserDefinedFunction =
+ udf(DetectMimeTypeTika.apply(_: Array[Byte]))
+ def extractBoilerpipeText: UserDefinedFunction =
+ udf(ExtractBoilerpipeText.apply(_: String))
+ def extractDate: UserDefinedFunction =
+ udf(ExtractDate.apply(_: String, _: String))
+ def extractDomain: UserDefinedFunction =
+ udf(ExtractDomain.apply(_: String, ""))
+ def extractImageLinks: UserDefinedFunction =
+ udf(ExtractImageLinks.apply(_: String, _: String))
+ def extractLinks: UserDefinedFunction =
+ udf(ExtractLinks.apply(_: String, _: String))
+ def getExtensionMime: UserDefinedFunction =
+ udf(GetExtensionMIME.apply(_: String, _: String))
def removeHTML: UserDefinedFunction = udf(RemoveHTML.apply(_: String))
- def removeHTTPHeader: UserDefinedFunction = udf(RemoveHTTPHeader.apply(_: String))
- def removePrefixWWW: UserDefinedFunction = udf[String, String](_.replaceAll("^\\s*www\\.", ""))
+ def removeHTTPHeader: UserDefinedFunction =
+ udf(RemoveHTTPHeader.apply(_: String))
+ def removePrefixWWW: UserDefinedFunction =
+ udf[String, String](_.replaceAll("^\\s*www\\.", ""))
// Filters
- def hasContent: UserDefinedFunction = udf((c: String, contentREs: Seq[String]) => {
- contentREs.map(re =>
- (re.r findFirstIn c) match {
- case Some(v) => true
- case None => false
- }).exists(identity)
- })
- def hasDate: UserDefinedFunction = udf((date_ : String, date: Seq[String]) => date.contains(date_))
- def hasDomains: UserDefinedFunction = udf((domain: String, domains: Seq[String]) => domains.contains(domain))
- def hasHTTPStatus: UserDefinedFunction = udf((statusCode: String, statusCodes: Seq[String]) => statusCodes.contains(statusCode))
- def hasImages: UserDefinedFunction = udf((date: String, mimeType: String) => date != null && mimeType.startsWith("image/"))
- def hasLanguages: UserDefinedFunction = udf((language: String, languages: Seq[String]) => languages.contains(language))
- def hasMIMETypes: UserDefinedFunction = udf((mimeType: String, mimeTypes: Seq[String]) => mimeTypes.contains(mimeType))
- def hasMIMETypesTika: UserDefinedFunction = udf((mimeType: String, mimeTypesTika: Seq[String]) => mimeTypesTika.contains(mimeType))
- def hasUrlPatterns: UserDefinedFunction = udf((urlPattern: String, urlREs: Seq[String]) => {
- urlREs.map(re =>
- urlPattern match {
- case re.r() => true
- case _ => false
- }).exists(identity)
- })
- def hasUrls: UserDefinedFunction = udf((url: String, urls: Seq[String]) => urls.contains(url))
+ def hasContent: UserDefinedFunction =
+ udf((c: String, contentREs: Seq[String]) => {
+ contentREs
+ .map(re =>
+ (re.r findFirstIn c) match {
+ case Some(v) => true
+ case None => false
+ }
+ )
+ .exists(identity)
+ })
+ def hasDate: UserDefinedFunction =
+ udf((date_ : String, date: Seq[String]) => date.contains(date_))
+ def hasDomains: UserDefinedFunction =
+ udf((domain: String, domains: Seq[String]) => domains.contains(domain))
+ def hasHTTPStatus: UserDefinedFunction =
+ udf((statusCode: String, statusCodes: Seq[String]) =>
+ statusCodes.contains(statusCode)
+ )
+ def hasImages: UserDefinedFunction =
+ udf((date: String, mimeType: String) =>
+ date != null && mimeType.startsWith("image/")
+ )
+ def hasLanguages: UserDefinedFunction =
+ udf((language: String, languages: Seq[String]) =>
+ languages.contains(language)
+ )
+ def hasMIMETypes: UserDefinedFunction =
+ udf((mimeType: String, mimeTypes: Seq[String]) =>
+ mimeTypes.contains(mimeType)
+ )
+ def hasMIMETypesTika: UserDefinedFunction =
+ udf((mimeType: String, mimeTypesTika: Seq[String]) =>
+ mimeTypesTika.contains(mimeType)
+ )
+ def hasUrlPatterns: UserDefinedFunction =
+ udf((urlPattern: String, urlREs: Seq[String]) => {
+ urlREs
+ .map(re =>
+ urlPattern match {
+ case re.r() => true
+ case _ => false
+ }
+ )
+ .exists(identity)
+ })
+ def hasUrls: UserDefinedFunction =
+ udf((url: String, urls: Seq[String]) => urls.contains(url))
}
diff --git a/src/test/scala/io/archivesunleashed/ArcTest.scala b/src/test/scala/io/archivesunleashed/ArcTest.scala
index ec3852d2..a879338c 100644
--- a/src/test/scala/io/archivesunleashed/ArcTest.scala
+++ b/src/test/scala/io/archivesunleashed/ArcTest.scala
@@ -18,7 +18,13 @@ package io.archivesunleashed
import com.google.common.io.Resources
import io.archivesunleashed.matchbox.ExtractDate.DateComponent
-import io.archivesunleashed.matchbox.{DetectLanguage, DetectMimeTypeTika, ExtractLinks, RemoveHTML, RemoveHTTPHeader}
+import io.archivesunleashed.matchbox.{
+ DetectLanguage,
+ DetectMimeTypeTika,
+ ExtractLinks,
+ RemoveHTML,
+ RemoveHTTPHeader
+}
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
@@ -48,38 +54,46 @@ class ArcTest extends FunSuite with BeforeAndAfter {
test("Filter date RDD") {
val startSS = 0
val monthSS = 6
- val four = RecordLoader.loadArchives(arcPath, sc)
+ val four = RecordLoader
+ .loadArchives(arcPath, sc)
.keepDate(List("200804", dayMonthTestA), DateComponent.YYYYMM)
.map(r => r.getCrawlDate)
.collect()
- val five = RecordLoader.loadArchives(arcPath, sc)
- .keepDate(List(dayMonthTestA,"200807"), DateComponent.YYYYMM)
+ val five = RecordLoader
+ .loadArchives(arcPath, sc)
+ .keepDate(List(dayMonthTestA, "200807"), DateComponent.YYYYMM)
.map(r => r.getCrawlDate)
.collect()
four.foreach(date => assert(date.substring(startSS, monthSS) == "200804"))
- five.foreach(date => assert(date.substring(startSS, monthSS) == dayMonthTestA))
+ five.foreach(date =>
+ assert(date.substring(startSS, monthSS) == dayMonthTestA)
+ )
}
test("Filter URL pattern RDD") {
- val keepMatches = RecordLoader.loadArchives(arcPath, sc)
+ val keepMatches = RecordLoader
+ .loadArchives(arcPath, sc)
.keepUrlPatterns(Set("http://www.archive.org/about/.*".r))
- val discardMatches = RecordLoader.loadArchives(arcPath, sc)
- .discardUrlPatterns(Set("http://www.archive.org/about/.*".r))
+ val discardMatches = RecordLoader
+ .loadArchives(arcPath, sc)
+ .discardUrlPatterns(Set("http://www.archive.org/about/.*".r))
assert(keepMatches.count == 16L)
assert(discardMatches.count == 284L)
}
test("Count links RDD") {
- val links = RecordLoader.loadArchives(arcPath, sc)
+ val links = RecordLoader
+ .loadArchives(arcPath, sc)
.map(r => ExtractLinks(r.getUrl, r.getContentString))
.reduce((a, b) => a ++ b)
assert(links.size == 664)
}
test("Detect language RDD") {
- val languageCounts = RecordLoader.loadArchives(arcPath, sc)
+ val languageCounts = RecordLoader
+ .loadArchives(arcPath, sc)
.keepMimeTypes(Set("text/html"))
.map(r => RemoveHTML(r.getContentString))
.groupBy(content => DetectLanguage(content))
@@ -95,30 +109,32 @@ class ArcTest extends FunSuite with BeforeAndAfter {
case ("lt", count) => assert(61L == count)
case ("no", count) => assert(6L == count)
case ("ro", count) => assert(4L == count)
- case (_, count) => print(_)
+ case (_, count) => print(_)
}
}
test("Detect MIMEtype Tika RDD") {
- val mimeTypeCounts = RecordLoader.loadArchives(arcPath, sc)
+ val mimeTypeCounts = RecordLoader
+ .loadArchives(arcPath, sc)
.map(r => RemoveHTTPHeader(r.getContentString))
.groupBy(content => DetectMimeTypeTika(content.getBytes))
.map(f => {
(f._1, f._2.size)
- }).collect
+ })
+ .collect
mimeTypeCounts.foreach {
- case ("image/gif", count) => assert(29L == count)
- case ("image/png", count) => assert(8L == count)
- case ("image/jpeg", count) => assert(18L == count)
- case ("text/html", count) => assert(132L == count)
- case ("text/plain", count) => assert(86L == count)
- case ("application/xml", count) => assert(1L == count)
- case ("application/rss+xml", count) => assert(9L == count)
- case ("application/xhtml+xml", count) => assert(1L == count)
- case ("application/octet-stream", count) => assert(26L == count)
+ case ("image/gif", count) => assert(29L == count)
+ case ("image/png", count) => assert(8L == count)
+ case ("image/jpeg", count) => assert(18L == count)
+ case ("text/html", count) => assert(132L == count)
+ case ("text/plain", count) => assert(86L == count)
+ case ("application/xml", count) => assert(1L == count)
+ case ("application/rss+xml", count) => assert(9L == count)
+ case ("application/xhtml+xml", count) => assert(1L == count)
+ case ("application/octet-stream", count) => assert(26L == count)
case ("application/x-shockwave-flash", count) => assert(8L == count)
- case (_, count) => print(_)
+ case (_, count) => print(_)
}
}
diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
index 0cdc196b..29d461dd 100644
--- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
+++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
@@ -52,79 +52,144 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
}
test("Resource name produces expected result") {
- val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
- .map(x => FilenameUtils.getName(x.getArchiveFilename))
- .take(3)
- val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
- .map(x => FilenameUtils.getName(x.getArchiveFilename)).take(3)
- assert(textSampleArc.deep == Array(exampleArc,
- exampleArc, exampleArc).deep)
- assert(textSampleWarc.deep == Array(exampleWarc,
- exampleWarc, exampleWarc).deep)
+ val textSampleArc = RecordLoader
+ .loadArchives(arcPath, sc)
+ .map(x => FilenameUtils.getName(x.getArchiveFilename))
+ .take(3)
+ val textSampleWarc = RecordLoader
+ .loadArchives(warcPath, sc)
+ .map(x => FilenameUtils.getName(x.getArchiveFilename))
+ .take(3)
+ assert(textSampleArc.deep == Array(exampleArc, exampleArc, exampleArc).deep)
+ assert(
+ textSampleWarc.deep == Array(exampleWarc, exampleWarc, exampleWarc).deep
+ )
}
test("Crawl Dates") {
- val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
- .map(x => x.getCrawlDate).take(3)
- val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
- .map(x => x.getCrawlDate).take(3)
- assert(textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep)
- assert(textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep)
+ val textSampleArc = RecordLoader
+ .loadArchives(arcPath, sc)
+ .map(x => x.getCrawlDate)
+ .take(3)
+ val textSampleWarc = RecordLoader
+ .loadArchives(warcPath, sc)
+ .map(x => x.getCrawlDate)
+ .take(3)
+ assert(
+ textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep
+ )
+ assert(
+ textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep
+ )
}
test("Domains") {
- val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
- .map(x => x.getDomain).take(3)
- val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
- .map(x => x.getDomain).take(3)
+ val textSampleArc = RecordLoader
+ .loadArchives(arcPath, sc)
+ .map(x => x.getDomain)
+ .take(3)
+ val textSampleWarc = RecordLoader
+ .loadArchives(warcPath, sc)
+ .map(x => x.getDomain)
+ .take(3)
assert(textSampleArc.deep == Array("", "", exampleUrl).deep)
assert(textSampleWarc.deep == Array("", exampleUrl, exampleUrl).deep)
}
test("URLs") {
- val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
- .map(x => x.getUrl).take(3)
- val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
- .map(x => x.getUrl).take(3)
- assert(textSampleArc.deep == Array("filedesc://IAH-20080430204825-00000-blackbook.arc",
- "dns:www.archive.org", "http://www.archive.org/robots.txt").deep)
- assert(textSampleWarc.deep == Array("dns:www.archive.org",
- "http://www.archive.org/robots.txt", "http://www.archive.org/").deep)
+ val textSampleArc = RecordLoader
+ .loadArchives(arcPath, sc)
+ .map(x => x.getUrl)
+ .take(3)
+ val textSampleWarc = RecordLoader
+ .loadArchives(warcPath, sc)
+ .map(x => x.getUrl)
+ .take(3)
+ assert(
+ textSampleArc.deep == Array(
+ "filedesc://IAH-20080430204825-00000-blackbook.arc",
+ "dns:www.archive.org",
+ "http://www.archive.org/robots.txt"
+ ).deep
+ )
+ assert(
+ textSampleWarc.deep == Array(
+ "dns:www.archive.org",
+ "http://www.archive.org/robots.txt",
+ "http://www.archive.org/"
+ ).deep
+ )
}
test("MIMEtype") {
- val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
- .map(x => x.getMimeType).take(3)
- val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
- .map(x => x.getMimeType).take(3)
- assert (textSampleArc.deep == Array (exampleMimeType, "text/dns",
- exampleMimeType).deep)
- assert (textSampleWarc.deep == Array("unknown", exampleMimeType,
- "text/html").deep)
+ val textSampleArc = RecordLoader
+ .loadArchives(arcPath, sc)
+ .map(x => x.getMimeType)
+ .take(3)
+ val textSampleWarc = RecordLoader
+ .loadArchives(warcPath, sc)
+ .map(x => x.getMimeType)
+ .take(3)
+ assert(
+ textSampleArc.deep == Array(
+ exampleMimeType,
+ "text/dns",
+ exampleMimeType
+ ).deep
+ )
+ assert(
+ textSampleWarc.deep == Array("unknown", exampleMimeType, "text/html").deep
+ )
}
test("Get HTTP status") {
- val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
- .map(x => x.getHttpStatus).take(3)
- val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
- .map(x => x.getHttpStatus).take(3)
- assert (textSampleArc.deep == Array(exampleStatusCode1, exampleStatusCode1,
- exampleStatusCode2).deep)
- assert (textSampleWarc.deep == Array(exampleStatusCode1, exampleStatusCode2,
- exampleStatusCode2).deep)
+ val textSampleArc = RecordLoader
+ .loadArchives(arcPath, sc)
+ .map(x => x.getHttpStatus)
+ .take(3)
+ val textSampleWarc = RecordLoader
+ .loadArchives(warcPath, sc)
+ .map(x => x.getHttpStatus)
+ .take(3)
+ assert(
+ textSampleArc.deep == Array(
+ exampleStatusCode1,
+ exampleStatusCode1,
+ exampleStatusCode2
+ ).deep
+ )
+ assert(
+ textSampleWarc.deep == Array(
+ exampleStatusCode1,
+ exampleStatusCode2,
+ exampleStatusCode2
+ ).deep
+ )
}
test("Get Payload Digest") {
- val textSampleArc = RecordLoader.loadArchives(arcPath, sc)
- .map(x => x.getPayloadDigest).take(3)
- val textSampleWarc = RecordLoader.loadArchives(warcPath, sc)
- .map(x => x.getPayloadDigest).take(3)
- assert (textSampleArc.deep == Array("sha1:252efd6dd414d91812dd9b0f897cdb2b44f64601",
- "sha1:8d115d0e83c5dcd66b13619e04d60a36cb2c1ee4",
- "sha1:ede22581685942721c7b9743dced317633d00e33").deep)
- assert (textSampleWarc.deep == Array(null,
- "sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U",
- "sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV").deep)
+ val textSampleArc = RecordLoader
+ .loadArchives(arcPath, sc)
+ .map(x => x.getPayloadDigest)
+ .take(3)
+ val textSampleWarc = RecordLoader
+ .loadArchives(warcPath, sc)
+ .map(x => x.getPayloadDigest)
+ .take(3)
+ assert(
+ textSampleArc.deep == Array(
+ "sha1:252efd6dd414d91812dd9b0f897cdb2b44f64601",
+ "sha1:8d115d0e83c5dcd66b13619e04d60a36cb2c1ee4",
+ "sha1:ede22581685942721c7b9743dced317633d00e33"
+ ).deep
+ )
+ assert(
+ textSampleWarc.deep == Array(
+ null,
+ "sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U",
+ "sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV"
+ ).deep
+ )
}
after {
diff --git a/src/test/scala/io/archivesunleashed/CountableRDDTest.scala b/src/test/scala/io/archivesunleashed/CountableRDDTest.scala
index 395479ae..ec099531 100644
--- a/src/test/scala/io/archivesunleashed/CountableRDDTest.scala
+++ b/src/test/scala/io/archivesunleashed/CountableRDDTest.scala
@@ -39,7 +39,8 @@ class CountableRDDTest extends FunSuite with BeforeAndAfter {
}
test("Count records; Extract Domain RDD ") {
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
.map(r => ExtractDomain(r.getUrl))
val r = base
diff --git a/src/test/scala/io/archivesunleashed/RecordDFTest.scala b/src/test/scala/io/archivesunleashed/RecordDFTest.scala
index 994ffc0d..978bbcfb 100644
--- a/src/test/scala/io/archivesunleashed/RecordDFTest.scala
+++ b/src/test/scala/io/archivesunleashed/RecordDFTest.scala
@@ -16,11 +16,22 @@
package io.archivesunleashed
-import io.archivesunleashed.udfs.{detectLanguage, detectMimeTypeTika,
- extractDomain, removeHTML,
- hasContent, hasDate, hasDomains, hasHTTPStatus,
- hasImages, hasLanguages, hasMIMETypes,
- hasMIMETypesTika, hasUrlPatterns, hasUrls}
+import io.archivesunleashed.udfs.{
+ detectLanguage,
+ detectMimeTypeTika,
+ extractDomain,
+ removeHTML,
+ hasContent,
+ hasDate,
+ hasDomains,
+ hasHTTPStatus,
+ hasImages,
+ hasLanguages,
+ hasMIMETypes,
+ hasMIMETypesTika,
+ hasUrlPatterns,
+ hasUrls
+}
import com.google.common.io.Resources
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
@@ -46,12 +57,13 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
test("Keep valid pages DF") {
val expected = "http://www.archive.org/"
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.keepValidPagesDF()
.take(1)(0)(1)
- assert (base.toString == expected)
+ assert(base.toString == expected)
}
test("Has HTTP Status") {
@@ -61,13 +73,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val expected = "000"
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"http_status_code")
- .filter(hasHTTPStatus($"http_status_code", lit(Array("200","000"))))
+ .filter(hasHTTPStatus($"http_status_code", lit(Array("200", "000"))))
.take(1)(0)(0)
- assert (base.toString == expected)
+ assert(base.toString == expected)
}
test("Has URLs") {
@@ -78,20 +91,32 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
val expected1 = "http://www.archive.org/robots.txt"
val expected2 = "http://www.archive.org/"
- val base1 = RecordLoader.loadArchives(arcPath, sc)
+ val base1 = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"url")
- .filter(hasUrls($"url", lit(Array("http://www.archive.org/","http://www.archive.org/robots.txt"))))
+ .filter(
+ hasUrls(
+ $"url",
+ lit(
+ Array(
+ "http://www.archive.org/",
+ "http://www.archive.org/robots.txt"
+ )
+ )
+ )
+ )
.take(1)(0)(0)
- val base2 = RecordLoader.loadArchives(arcPath, sc)
+ val base2 = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"url")
.filter(hasUrls($"url", lit(Array("http://www.archive.org/"))))
.take(1)(0)(0)
- assert (base1.toString == expected1)
- assert (base2.toString == expected2)
+ assert(base1.toString == expected1)
+ assert(base2.toString == expected2)
}
test("Has domains") {
@@ -101,13 +126,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val expected = "http://www.archive.org/robots.txt"
- val base1 = RecordLoader.loadArchives(arcPath, sc)
+ val base1 = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"url")
.filter(hasDomains(extractDomain($"url"), lit(Array("www.archive.org"))))
.take(1)(0)(0)
- assert (base1.toString == expected)
+ assert(base1.toString == expected)
}
test("Has MIME Types") {
@@ -117,13 +143,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val expected = "text/html"
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"mime_type_web_server")
.filter(hasMIMETypes($"mime_type_web_server", lit(Array("text/html"))))
.take(1)(0)(0)
- assert (base.toString == expected)
+ assert(base.toString == expected)
}
test("Has MIME Types Tika") {
@@ -133,13 +160,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val expected = "text/html"
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"mime_type_web_server")
.filter(hasMIMETypesTika($"mime_type_tika", lit(Array("text/html"))))
.take(1)(0)(0)
- assert (base.toString == expected)
+ assert(base.toString == expected)
}
test("Has Content") {
@@ -149,13 +177,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val expected = "http://www.archive.org/images/logoc.jpg"
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
- .select($"url",$"content")
+ .select($"url", $"content")
.filter(hasContent($"content", lit(Array("Content-Length: [0-9]{4}"))))
.take(1)(0)(0)
- assert (base.toString == expected)
+ assert(base.toString == expected)
}
test("Has URL Patterns") {
@@ -165,21 +194,23 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val expected1 = "http://www.archive.org/images/go-button-gateway.gif"
- val base1 = RecordLoader.loadArchives(arcPath, sc)
+ val base1 = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"url")
.filter(hasUrlPatterns($"url", lit(Array(".*images.*"))))
.take(2)(1)(0)
val expected2 = "http://www.archive.org/index.php?skin=classic"
- val base2 = RecordLoader.loadArchives(arcPath, sc)
+ val base2 = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"url")
.filter(hasUrlPatterns($"url", lit(Array(".*index.*"))))
.take(3)(1)(0)
- assert (base1.toString == expected1)
- assert (base2.toString == expected2)
+ assert(base1.toString == expected1)
+ assert(base2.toString == expected2)
}
test("Has Languages") {
@@ -189,13 +220,19 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val expected = "de"
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select(detectLanguage(removeHTML($"content")).as("language"))
- .filter(hasLanguages(detectLanguage(removeHTML($"content")), lit(Array("de","ht"))))
+ .filter(
+ hasLanguages(
+ detectLanguage(removeHTML($"content")),
+ lit(Array("de", "ht"))
+ )
+ )
.take(1)(0)(0)
- assert (base.toString == expected)
+ assert(base.toString == expected)
}
test("Has Images") {
@@ -205,13 +242,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val expected = "image/jpeg"
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"mime_type_tika")
.filter(hasImages($"crawl_date", detectMimeTypeTika($"bytes")))
.take(1)(0)(0)
- assert (base.toString == expected)
+ assert(base.toString == expected)
}
test("Has Date") {
@@ -221,13 +259,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
// scalastyle:on
val expected = Array("20080430")
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.all()
.select($"crawl_date")
.filter(hasDate($"crawl_date", lit(expected)))
.take(1)(0)(0)
- assert (base.toString == "20080430")
+ assert(base.toString == "20080430")
}
after {
diff --git a/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala
index 9260e666..f5a7c802 100644
--- a/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala
+++ b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala
@@ -38,7 +38,8 @@ class RecordLoaderTest extends FunSuite with BeforeAndAfter {
}
test("Load WARC") {
- val base = RecordLoader.loadArchives(warcPath, sc)
+ val base = RecordLoader
+ .loadArchives(warcPath, sc)
.keepValidPages()
.map(x => x.getUrl)
.take(1)
diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala
index 6c5efcd6..a324aa33 100644
--- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala
+++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala
@@ -45,16 +45,20 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
test("Expect no valid pages RDD") {
val expectedLength = 0
- val base = RecordLoader.loadArchives(badPath, sc)
- .keepValidPages().take(2)
- assert (base.length == expectedLength)
+ val base = RecordLoader
+ .loadArchives(badPath, sc)
+ .keepValidPages()
+ .take(2)
+ assert(base.length == expectedLength)
}
- test ("Expect no images RDD") {
+ test("Expect no images RDD") {
val expectedLength = 0
- val base = RecordLoader.loadArchives(badPath, sc)
- .keepValidPages().take(2)
- assert (base.length == expectedLength)
+ val base = RecordLoader
+ .loadArchives(badPath, sc)
+ .keepValidPages()
+ .take(2)
+ assert(base.length == expectedLength)
}
test("Keep date RDD") {
@@ -62,170 +66,214 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
val base = RecordLoader.loadArchives(arcPath, sc)
val component = DateComponent.YYYY
val r = base
- .filter (x => ExtractDate(x.getCrawlDate, component) == testDate)
- .map ( mp => mp.getUrl).take(3)
- val r2 = base.keepDate(List(testDate), component)
- .map ( mp => mp.getUrl).take(3)
- assert (r2.sameElements(r)) }
+ .filter(x => ExtractDate(x.getCrawlDate, component) == testDate)
+ .map(mp => mp.getUrl)
+ .take(3)
+ val r2 = base
+ .keepDate(List(testDate), component)
+ .map(mp => mp.getUrl)
+ .take(3)
+ assert(r2.sameElements(r))
+ }
- test ("Keep HTTP status codes RDD") {
+ test("Keep HTTP status codes RDD") {
val expected = 94
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
- val statusCodes: Set[String] = Set ("200", "404")
+ val statusCodes: Set[String] = Set("200", "404")
val r2 = base.keepHttpStatus(statusCodes).count
- assert (r2 == expected)
+ assert(r2 == expected)
}
- test ("Keep URLs RDD") {
+ test("Keep URLs RDD") {
val expected = 1
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
- val urls: Set[String] = Set (archive, sloan)
+ val urls: Set[String] = Set(archive, sloan)
val r2 = base.keepUrls(urls).count
- assert (r2 == expected)
+ assert(r2 == expected)
}
- test ("Keep URL patterns RDD") {
+ test("Keep URL patterns RDD") {
val expected = 1
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
- val urls = Set (archive.r, sloan.r, "".r)
+ val urls = Set(archive.r, sloan.r, "".r)
val r2 = base.keepUrlPatterns(urls).count
- assert (r2 == expected)
+ assert(r2 == expected)
}
- test ("Keep domains RDD") {
+ test("Keep domains RDD") {
val expected = 91
- val base2 = RecordLoader.loadArchives(arcPath, sc)
+ val base2 = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
val urls: Set[String] = Set("www.archive.org", "www.sloan.org")
val x2 = base2.keepDomains(urls).count()
- assert (x2 == expected )
+ assert(x2 == expected)
}
- test ("Keep languages RDD") {
- val base2 = RecordLoader.loadArchives(arcPath, sc)
+ test("Keep languages RDD") {
+ val base2 = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
val langs: Set[String] = Set("en", "fr")
- val r = Array("http://www.archive.org/",
- "http://www.archive.org/index.php")
- val r2 = base2.keepLanguages(langs)
- .map(r => r.getUrl).take(2)
- assert (r2.sameElements(r))
+ val r = Array("http://www.archive.org/", "http://www.archive.org/index.php")
+ val r2 = base2
+ .keepLanguages(langs)
+ .map(r => r.getUrl)
+ .take(2)
+ assert(r2.sameElements(r))
}
- test ("Discard languages RDD") {
- val base2 = RecordLoader.loadArchives(arcPath, sc)
+ test("Discard languages RDD") {
+ val base2 = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
val langs: Set[String] = Set("fr")
val r = Array("http://www.archive.org/", "http://www.archive.org/index.php")
- val r2 = base2.discardLanguages(langs)
- .map(r => r.getUrl).take(2)
- assert (r2.sameElements(r))
+ val r2 = base2
+ .discardLanguages(langs)
+ .map(r => r.getUrl)
+ .take(2)
+ assert(r2.sameElements(r))
}
- test ("Keep MIMEtype Tika RDD") {
+ test("Keep MIMEtype Tika RDD") {
val base = RecordLoader.loadArchives(arcPath, sc)
- val mime = Set ("text/plain", "image/jpeg")
- val r2 = base.keepMimeTypesTika(mime)
- .map (mp => mp.getUrl).take(3)
- assert (r2.deep == Array("dns:www.archive.org",
- "http://www.archive.org/robots.txt",
- "http://www.archive.org/images/logoc.jpg").deep)
+ val mime = Set("text/plain", "image/jpeg")
+ val r2 = base
+ .keepMimeTypesTika(mime)
+ .map(mp => mp.getUrl)
+ .take(3)
+ assert(
+ r2.deep == Array(
+ "dns:www.archive.org",
+ "http://www.archive.org/robots.txt",
+ "http://www.archive.org/images/logoc.jpg"
+ ).deep
+ )
}
- test ("Keep MIMEtype RDD") {
+ test("Keep MIMEtype RDD") {
val base = RecordLoader.loadArchives(arcPath, sc)
- val mime = Set ("text/plain", "image/jpeg")
- val r2 = base.keepMimeTypes(mime)
- .map (mp => mp.getUrl).take(3)
- assert (r2.deep == Array("filedesc://IAH-20080430204825-00000-blackbook.arc",
- "http://www.archive.org/robots.txt",
- "http://www.archive.org/images/logoc.jpg").deep)
+ val mime = Set("text/plain", "image/jpeg")
+ val r2 = base
+ .keepMimeTypes(mime)
+ .map(mp => mp.getUrl)
+ .take(3)
+ assert(
+ r2.deep == Array(
+ "filedesc://IAH-20080430204825-00000-blackbook.arc",
+ "http://www.archive.org/robots.txt",
+ "http://www.archive.org/images/logoc.jpg"
+ ).deep
+ )
}
- test ("Keep content RDD"){
+ test("Keep content RDD") {
val expected = 1
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
val regno = Set(regex, raw"UNINTELLIBLEDFSJKLS".r)
val y2 = base.keepContent(Set(regex)).count()
val y1 = base.keepContent(regno).count()
- assert (y2 == expected)
- assert (y1 == expected)
+ assert(y2 == expected)
+ assert(y1 == expected)
}
- test ("Discard MIMEtype RDD") {
+ test("Discard MIMEtype RDD") {
val base = RecordLoader.loadArchives(arcPath, sc)
- val mime = Set ("text/plain", "image/jpeg")
- val r2 = base.discardMimeTypes(mime)
- .map (mp => mp.getUrl).take(3)
- assert (r2.deep == Array("dns:www.archive.org", archive,
- "http://www.archive.org/index.php").deep)
+ val mime = Set("text/plain", "image/jpeg")
+ val r2 = base
+ .discardMimeTypes(mime)
+ .map(mp => mp.getUrl)
+ .take(3)
+ assert(
+ r2.deep == Array(
+ "dns:www.archive.org",
+ archive,
+ "http://www.archive.org/index.php"
+ ).deep
+ )
}
- test ("Discard MIMEtype Tika RDD") {
+ test("Discard MIMEtype Tika RDD") {
val base = RecordLoader.loadArchives(arcPath, sc)
- val mime = Set ("text/plain", "image/jpeg")
- val r2 = base.discardMimeTypesTika(mime)
- .map (mp => mp.getUrl).take(3)
- assert (r2.deep == Array("filedesc://IAH-20080430204825-00000-blackbook.arc",
- "http://www.archive.org/", "http://www.archive.org/index.php").deep)
+ val mime = Set("text/plain", "image/jpeg")
+ val r2 = base
+ .discardMimeTypesTika(mime)
+ .map(mp => mp.getUrl)
+ .take(3)
+ assert(
+ r2.deep == Array(
+ "filedesc://IAH-20080430204825-00000-blackbook.arc",
+ "http://www.archive.org/",
+ "http://www.archive.org/index.php"
+ ).deep
+ )
}
- test ("Discard date RDD") {
+ test("Discard date RDD") {
val base = RecordLoader.loadArchives(arcPath, sc)
val date = "20080430"
- val r = base.filter( x=> x.getCrawlDate != date).collect()
+ val r = base.filter(x => x.getCrawlDate != date).collect()
val r2 = base.discardDate(date).take(3)
- assert (r.deep == Array().deep)
+ assert(r.deep == Array().deep)
}
- test ("Discard URLs RDD") {
+ test("Discard URLs RDD") {
val expected = 94
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
- val urls: Set[String] = Set (sloan)
+ val urls: Set[String] = Set(sloan)
val r2 = base.discardUrls(urls).count()
- assert (r2 == expected)
+ assert(r2 == expected)
}
- test ("Discard URL patterns RDD") {
+ test("Discard URL patterns RDD") {
val expected = 93
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
- val urls = Set (archive.r, sloan.r, "".r)
+ val urls = Set(archive.r, sloan.r, "".r)
val r2 = base.discardUrlPatterns(urls).count
- assert (r2 == expected)
+ assert(r2 == expected)
}
- test ("Discard HTTP status codes RDD") {
+ test("Discard HTTP status codes RDD") {
val expected = 46
val base = RecordLoader.loadArchives(arcPath, sc)
- val statusCodes: Set[String] = Set ("200", "404")
+ val statusCodes: Set[String] = Set("200", "404")
val r2 = base.discardHttpStatus(statusCodes).count
- assert (r2 == expected)
+ assert(r2 == expected)
}
- test ("Discard domains RDD") {
+ test("Discard domains RDD") {
val expected = 94
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
- val urls: Set[String] = Set ("www.sloan.org")
+ val urls: Set[String] = Set("www.sloan.org")
val r2 = base.discardDomains(urls).count()
- assert (r2 == expected)
+ assert(r2 == expected)
}
- test ("Discard content RDD") {
+ test("Discard content RDD") {
val expected = 93
- val base = RecordLoader.loadArchives(arcPath, sc)
+ val base = RecordLoader
+ .loadArchives(arcPath, sc)
.keepValidPages()
val regno = Set(regex, raw"UNINTELLIBLEDFSJKLS".r)
val y2 = base.discardContent(Set(regex)).count()
val y1 = base.discardContent(regno).count()
- assert (y2 == expected)
- assert (y1 == expected)
+ assert(y2 == expected)
+ assert(y1 == expected)
}
after {
diff --git a/src/test/scala/io/archivesunleashed/WarcTest.scala b/src/test/scala/io/archivesunleashed/WarcTest.scala
index 5eb2b112..44b9879f 100644
--- a/src/test/scala/io/archivesunleashed/WarcTest.scala
+++ b/src/test/scala/io/archivesunleashed/WarcTest.scala
@@ -58,7 +58,8 @@ class WarcTest extends FunSuite with BeforeAndAfter {
}
test("WARC get content RDD") {
- val a = RecordLoader.loadArchives(warcPath, sc)
+ val a = RecordLoader
+ .loadArchives(warcPath, sc)
.map(r => r.getContentString)
.take(1)
assert(a.head.nonEmpty)
diff --git a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala
index ad8730af..b579b905 100644
--- a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala
@@ -25,7 +25,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter {
- private val arcPath = Resources.getResource("warc/example.media.warc.gz").getPath
+ private val arcPath =
+ Resources.getResource("warc/example.media.warc.gz").getPath
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
diff --git a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala
index 7083fce2..2e6fba90 100644
--- a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala
@@ -43,78 +43,722 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter {
private val webPagesOpt = "WebPagesExtractor"
private var sc: SparkContext = _
private val testSuccessCmds = Array(
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "gexf"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "graphml"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, "--partition", "1", extractOpt, plainTextOpt),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--output-format", "parquet", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--split"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--partition", "1"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet", "--partition", "1")
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "DomainFrequencyExtractor"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "DomainFrequencyExtractor",
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "DomainFrequencyExtractor",
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "DomainFrequencyExtractor",
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ domainGraphOpt
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ domainGraphOpt,
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ domainGraphOpt,
+ "--output-format",
+ "gexf"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ domainGraphOpt,
+ "--output-format",
+ "graphml"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ domainGraphOpt,
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ plainTextOpt
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ plainTextOpt,
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ plainTextOpt,
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ plainTextOpt,
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ plainTextOpt,
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ "--partition",
+ "1",
+ extractOpt,
+ plainTextOpt
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ imageGraphOpt
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ imageGraphOpt,
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ imageGraphOpt,
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ imageGraphOpt,
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ imageGraphOpt,
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ webPagesOpt
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ webPagesOpt,
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ webPagesOpt,
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ webPagesOpt,
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ webPagesOpt,
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "AudioInformationExtractor"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "AudioInformationExtractor",
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "AudioInformationExtractor",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "AudioInformationExtractor",
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "AudioInformationExtractor",
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "ImageInformationExtractor"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "ImageInformationExtractor",
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "ImageInformationExtractor",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "ImageInformationExtractor",
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "ImageInformationExtractor",
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PDFInformationExtractor"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PDFInformationExtractor",
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PDFInformationExtractor",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PDFInformationExtractor",
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PDFInformationExtractor",
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PresentationProgramInformationExtractor"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PresentationProgramInformationExtractor",
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PresentationProgramInformationExtractor",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PresentationProgramInformationExtractor",
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "PresentationProgramInformationExtractor",
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "SpreadsheetInformationExtractor"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "SpreadsheetInformationExtractor",
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "SpreadsheetInformationExtractor",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "SpreadsheetInformationExtractor",
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "SpreadsheetInformationExtractor",
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "VideoInformationExtractor"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "VideoInformationExtractor",
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "VideoInformationExtractor",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "VideoInformationExtractor",
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "VideoInformationExtractor",
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WordProcessorInformationExtractor"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WordProcessorInformationExtractor",
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WordProcessorInformationExtractor",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WordProcessorInformationExtractor",
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WordProcessorInformationExtractor",
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WebGraphExtractor"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WebGraphExtractor",
+ "--split"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WebGraphExtractor",
+ "--partition",
+ "1"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WebGraphExtractor",
+ "--output-format",
+ "parquet"
+ ),
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "WebGraphExtractor",
+ "--output-format",
+ "parquet",
+ "--partition",
+ "1"
+ )
)
private val testFailCmds = Array(
Array(inputOpt, "_abracadabra", outputOpt, outputDir),
Array(outputOpt, outputDir),
Array(inputOpt, "_abracadabra"),
- Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "abracadabra")
+ Array(
+ inputOpt,
+ arcPath,
+ warcPath,
+ outputOpt,
+ outputDir,
+ extractOpt,
+ "abracadabra"
+ )
)
before {
@@ -125,19 +769,19 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter {
}
test("Command line app functionality tests") {
- for {a <- testSuccessCmds} {
+ for { a <- testSuccessCmds } {
app.CommandLineAppRunner.test(a, sc)
assert(Files.exists(Paths.get(outputDir)))
FileUtils.deleteDirectory(new File(outputDir))
}
- for {a <- testFailCmds} {
+ for { a <- testFailCmds } {
try {
app.CommandLineAppRunner.test(a, sc)
assert(false)
} catch {
case e: IllegalArgumentException => assert(true)
- case _: Throwable => assert(false)
+ case _: Throwable => assert(false)
} finally {
assert(!Files.exists(Paths.get(outputDir)))
}
diff --git a/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala
index cc75e31b..e40ed89b 100644
--- a/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala
@@ -33,8 +33,8 @@ class ExtractPopularImagesDFTest extends FunSuite with BeforeAndAfter {
before {
val conf = new SparkConf()
- .setMaster(master)
- .setAppName(appName)
+ .setMaster(master)
+ .setAppName(appName)
conf.set("spark.driver.allowMultipleContexts", "true");
sc = new SparkContext(conf)
}
@@ -45,8 +45,8 @@ class ExtractPopularImagesDFTest extends FunSuite with BeforeAndAfter {
val imagesLowLimit = ExtractPopularImagesDF(exampledf, 3)
val imagesHighLimit = ExtractPopularImagesDF(exampledf, highTest)
val response = "1"
- assert (imagesLowLimit.take(1)(0)(1).toString == response)
- assert (imagesHighLimit.take(1)(0)(1).toString == response)
+ assert(imagesLowLimit.take(1)(0)(1).toString == response)
+ assert(imagesHighLimit.take(1)(0)(1).toString == response)
}
after {
if (sc != null) {
diff --git a/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala
index 51d58c93..c9a1db77 100644
--- a/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala
@@ -33,8 +33,8 @@ class ExtractPopularImagesTest extends FunSuite with BeforeAndAfter {
before {
val conf = new SparkConf()
- .setMaster(master)
- .setAppName(appName)
+ .setMaster(master)
+ .setAppName(appName)
conf.set("spark.driver.allowMultipleContexts", "true");
sc = new SparkContext(conf)
}
@@ -44,11 +44,13 @@ class ExtractPopularImagesTest extends FunSuite with BeforeAndAfter {
val examplerdd = RecordLoader.loadArchives(arcPath, sc)
val imagesLowLimit = ExtractPopularImages(examplerdd, 3, sc)
val imagesHighLimit = ExtractPopularImages(examplerdd, highTest, sc)
- val response = Array("1\thttp://www.archive.org/images/books-small.jpg",
+ val response = Array(
+ "1\thttp://www.archive.org/images/books-small.jpg",
"1\thttp://i.creativecommons.org/l/by-sa/3.0/88x31.png",
- "1\thttp://www.archive.org/images/blendbar.jpg")
- assert (imagesLowLimit.take(3).deep == response.deep)
- assert (imagesHighLimit.take(3).deep == response.deep)
+ "1\thttp://www.archive.org/images/blendbar.jpg"
+ )
+ assert(imagesLowLimit.take(3).deep == response.deep)
+ assert(imagesHighLimit.take(3).deep == response.deep)
}
after {
if (sc != null) {
diff --git a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala
index 9ef66f7b..ab88280c 100644
--- a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala
@@ -25,7 +25,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter {
- private val arcPath = Resources.getResource("warc/example.pdf.warc.gz").getPath
+ private val arcPath =
+ Resources.getResource("warc/example.pdf.warc.gz").getPath
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
@@ -44,7 +45,11 @@ class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 2
assert(dfResults.length == RESULTSLENGTH)
- assert(dfResults(0).get(0) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y")
+ assert(
+ dfResults(0).get(
+ 0
+ ) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y"
+ )
assert(dfResults(0).get(1) == "cost-analysis.pdf")
assert(dfResults(0).get(2) == "pdf")
assert(dfResults(0).get(3) == "application/pdf")
diff --git a/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala
index 4622ad76..4bd75644 100644
--- a/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala
@@ -45,12 +45,20 @@ class PlainTextExtractorTest extends FunSuite with BeforeAndAfter {
assert(dfResults.length == RESULTSLENGTH)
assert(dfResults(0).get(0) == "")
- assert(dfResults(4).get(0)
- .toString
- .startsWith("Author: Spivak, John L. (John Louis), b. 1897 Published: 1939"))
- assert(dfResults(50).get(0)
- .toString
- .startsWith("How many hours in a day They tell me 24 "))
+ assert(
+ dfResults(4)
+ .get(0)
+ .toString
+ .startsWith(
+ "Author: Spivak, John L. (John Louis), b. 1897 Published: 1939"
+ )
+ )
+ assert(
+ dfResults(50)
+ .get(0)
+ .toString
+ .startsWith("How many hours in a day They tell me 24 ")
+ )
}
after {
diff --git a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala
index 8be73740..6bdfee35 100644
--- a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala
@@ -24,8 +24,11 @@ import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
-class PresentationProgramInformationExtractorTest extends FunSuite with BeforeAndAfter {
- private val arcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+class PresentationProgramInformationExtractorTest
+ extends FunSuite
+ with BeforeAndAfter {
+ private val arcPath =
+ Resources.getResource("warc/example.docs.warc.gz").getPath
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
@@ -44,11 +47,23 @@ class PresentationProgramInformationExtractorTest extends FunSuite with BeforeAn
val RESULTSLENGTH = 2
assert(dfResults.length == RESULTSLENGTH)
- assert(dfResults(0).get(0) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx")
+ assert(
+ dfResults(0).get(
+ 0
+ ) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx"
+ )
assert(dfResults(0).get(1) == "aut-test-fixtures.pptx")
assert(dfResults(0).get(2) == "pptx")
- assert(dfResults(0).get(3) == "application/vnd.openxmlformats-officedocument.presentationml.presentation")
- assert(dfResults(0).get(4) == "application/vnd.openxmlformats-officedocument.presentationml.presentation")
+ assert(
+ dfResults(0).get(
+ 3
+ ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+ )
+ assert(
+ dfResults(0).get(
+ 4
+ ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+ )
assert(dfResults(0).get(5) == "7a7b1fe4b6d311376eaced9de3b682ee")
assert(dfResults(0).get(6) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec")
}
diff --git a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala
index 0754812f..79b8c781 100644
--- a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala
@@ -25,7 +25,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter {
- private val arcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+ private val arcPath =
+ Resources.getResource("warc/example.docs.warc.gz").getPath
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
@@ -44,11 +45,19 @@ class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 4
assert(dfResults.length == RESULTSLENGTH)
- assert(dfResults(0).get(0) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods")
+ assert(
+ dfResults(0).get(
+ 0
+ ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods"
+ )
assert(dfResults(0).get(1) == "test-aut-fixture.ods")
assert(dfResults(0).get(2) == "ods")
- assert(dfResults(0).get(3) == "application/vnd.oasis.opendocument.spreadsheet")
- assert(dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet")
+ assert(
+ dfResults(0).get(3) == "application/vnd.oasis.opendocument.spreadsheet"
+ )
+ assert(
+ dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet"
+ )
assert(dfResults(0).get(5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9")
assert(dfResults(0).get(6) == "448c357e78317877a98a399448031a89f1dda6fb")
}
diff --git a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala
index 1d4cec03..ac525428 100644
--- a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala
@@ -25,7 +25,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter {
- private val arcPath = Resources.getResource("warc/example.media.warc.gz").getPath
+ private val arcPath =
+ Resources.getResource("warc/example.media.warc.gz").getPath
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
@@ -44,7 +45,9 @@ class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter {
val RESULTSLENGTH = 1
assert(dfResults.length == RESULTSLENGTH)
- assert(dfResults(0).get(0) == "https://ruebot.net/2018-11-12%2016.14.11.mp4")
+ assert(
+ dfResults(0).get(0) == "https://ruebot.net/2018-11-12%2016.14.11.mp4"
+ )
assert(dfResults(0).get(1) == "2018-11-12%2016.14.11.mp4")
assert(dfResults(0).get(2) == "mp4")
assert(dfResults(0).get(3) == "video/mp4")
diff --git a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala
index 2668d713..8ea033d3 100644
--- a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala
@@ -24,8 +24,11 @@ import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
-class WordProcessorInformationExtractorTest extends FunSuite with BeforeAndAfter {
- private val arcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+class WordProcessorInformationExtractorTest
+ extends FunSuite
+ with BeforeAndAfter {
+ private val arcPath =
+ Resources.getResource("warc/example.docs.warc.gz").getPath
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
@@ -44,7 +47,11 @@ class WordProcessorInformationExtractorTest extends FunSuite with BeforeAndAfter
val RESULTSLENGTH = 3
assert(dfResults.length == RESULTSLENGTH)
- assert(dfResults(0).get(0) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf")
+ assert(
+ dfResults(0).get(
+ 0
+ ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf"
+ )
assert(dfResults(0).get(1) == "test-aut-fixtures.rtf")
assert(dfResults(0).get(2) == "rtf")
assert(dfResults(0).get(3) == "application/rtf")
diff --git a/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala b/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala
index 41a94c8c..d0812384 100644
--- a/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala
@@ -25,44 +25,56 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
import scala.io.Source
@RunWith(classOf[JUnitRunner])
-class WriteGEXFTest extends FunSuite with BeforeAndAfter{
+class WriteGEXFTest extends FunSuite with BeforeAndAfter {
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
- private val network = Seq(("Date1", "Source1", "Destination1", 3),
- ("Date2", "Source2", "Destination2", 4),
- ("Date3", "Source3", "Destination3", 100))
+ private val network = Seq(
+ ("Date1", "Source1", "Destination1", 3),
+ ("Date2", "Source2", "Destination2", 4),
+ ("Date3", "Source3", "Destination3", 100)
+ )
private val testFile = "temporaryTestFile.gexf"
before {
val conf = new SparkConf()
.setMaster(master)
.setAppName(appName)
- conf.set("spark.driver.allowMultipleContexts", "true");
- sc = new SparkContext(conf)
- }
+ conf.set("spark.driver.allowMultipleContexts", "true");
+ sc = new SparkContext(conf)
+ }
test("Creates the GEXF file from Array[Row]") {
val testLines = (0, 12, 22, 34)
if (Files.exists(Paths.get(testFile))) {
new File(testFile).delete()
}
- val networkarray = Array(Row.fromTuple(network(0)),
- Row.fromTuple(network(1)), Row.fromTuple(network(2)))
+ val networkarray = Array(
+ Row.fromTuple(network(0)),
+ Row.fromTuple(network(1)),
+ Row.fromTuple(network(2))
+ )
val ret = WriteGEXF(networkarray, testFile)
assert(ret)
val lines = Source.fromFile(testFile).getLines.toList
assert(lines(testLines._1) == """""")
- assert(lines(testLines._2) == """""")
+ assert(
+ lines(
+ testLines._2
+ ) == """"""
+ )
assert(lines(testLines._3) == """""")
assert(lines(testLines._4) == """""")
- assert(!WriteGEXF(networkarray ,""))
+ assert(!WriteGEXF(networkarray, ""))
}
test("Test if GEXF path is empty") {
val networkGraph = sc.parallelize(network)
- val networkarray = Array(Row.fromTuple(network(0)),
- Row.fromTuple(network(1)), Row.fromTuple(network(2)))
+ val networkarray = Array(
+ Row.fromTuple(network(0)),
+ Row.fromTuple(network(1)),
+ Row.fromTuple(network(2))
+ )
val gexf = WriteGEXF(networkarray, testFile)
assert(gexf)
assert(!WriteGEXF(networkarray, ""))
diff --git a/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala b/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala
index 7f1eb73a..ed264490 100644
--- a/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala
@@ -25,33 +25,38 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
import scala.io.Source
@RunWith(classOf[JUnitRunner])
-class WriteGraphMLTest extends FunSuite with BeforeAndAfter{
+class WriteGraphMLTest extends FunSuite with BeforeAndAfter {
private var sc: SparkContext = _
private val master = "local[4]"
private val appName = "example-spark"
private val linkCountOne = 3
private val linkCountTwo = 4
private val linkCountThree = 100
- private val network = Seq(("Date1", "Source1", "Destination1", linkCountOne),
- ("Date2", "Source2", "Destination2", linkCountTwo),
- ("Date3", "Source3", "Destination3", linkCountThree))
+ private val network = Seq(
+ ("Date1", "Source1", "Destination1", linkCountOne),
+ ("Date2", "Source2", "Destination2", linkCountTwo),
+ ("Date3", "Source3", "Destination3", linkCountThree)
+ )
private val testFile = "temporaryTestFile.graphml"
before {
val conf = new SparkConf()
.setMaster(master)
.setAppName(appName)
- conf.set("spark.driver.allowMultipleContexts", "true");
- sc = new SparkContext(conf)
- }
+ conf.set("spark.driver.allowMultipleContexts", "true");
+ sc = new SparkContext(conf)
+ }
test("Create WriteGraphML file from Array[Row]") {
val lineCheck = (0, 15, 22, 30)
if (Files.exists(Paths.get(testFile))) {
new File(testFile).delete()
}
- val networkarray = Array(Row.fromTuple(network(0)),
- Row.fromTuple(network(1)), Row.fromTuple(network(2)))
+ val networkarray = Array(
+ Row.fromTuple(network(0)),
+ Row.fromTuple(network(1)),
+ Row.fromTuple(network(2))
+ )
val ret = WriteGraphML(networkarray, testFile)
assert(ret)
assert(Files.exists(Paths.get(testFile)))
@@ -62,9 +67,12 @@ class WriteGraphMLTest extends FunSuite with BeforeAndAfter{
assert(lines(lineCheck._4) == """3""")
}
- test ("Test if GraphML path is empty") {
- val networkarray = Array(Row.fromTuple(network(0)),
- Row.fromTuple(network(1)), Row.fromTuple(network(2)))
+ test("Test if GraphML path is empty") {
+ val networkarray = Array(
+ Row.fromTuple(network(0)),
+ Row.fromTuple(network(1)),
+ Row.fromTuple(network(2))
+ )
val graphml = WriteGraphML(networkarray, testFile)
assert(graphml)
assert(!WriteGraphML(networkarray, ""))
diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
index 1b74d973..9e5cd7fc 100644
--- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala
@@ -25,10 +25,14 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
private val arcPath = Resources.getResource("arc/example.arc.gz").getPath
- private val mediaPath = Resources.getResource("warc/example.media.warc.gz").getPath
- private val docPath = Resources.getResource("warc/example.docs.warc.gz").getPath
- private val txtPath = Resources.getResource("warc/example.txt.warc.gz").getPath
- private val pdfPath = Resources.getResource("warc/example.pdf.warc.gz").getPath
+ private val mediaPath =
+ Resources.getResource("warc/example.media.warc.gz").getPath
+ private val docPath =
+ Resources.getResource("warc/example.docs.warc.gz").getPath
+ private val txtPath =
+ Resources.getResource("warc/example.txt.warc.gz").getPath
+ private val pdfPath =
+ Resources.getResource("warc/example.pdf.warc.gz").getPath
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
@@ -67,7 +71,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
val r_3 = imagegraph.take(100)(99)
assert(r_3.get(0) == "20080430")
- assert(r_3.get(1) == "http://www.archive.org/details/secretarmiesb00spivrich")
+ assert(
+ r_3.get(1) == "http://www.archive.org/details/secretarmiesb00spivrich"
+ )
assert(r_3.get(2) == "http://www.archive.org/images/star.png")
val r_4 = images.take(1)(0)
@@ -75,7 +81,11 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
assert(r_4.getAs[String](md5) == "8211d1fbb9b03d8522a1ae378f9d1b24")
val r_5 = pdfs.take(1)(0)
- assert(r_5.getAs[String](url) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y")
+ assert(
+ r_5.getAs[String](
+ url
+ ) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y"
+ )
assert(r_5.getAs[String](md5) == "aaba59d2287afd40c996488a39bbc0dd")
val r_6 = audio.take(1)(0)
@@ -83,19 +93,33 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter {
assert(r_6.getAs[String](md5) == "f7e7ec84b12c294e19af1ba41732c733")
val r_7 = video.take(1)(0)
- assert(r_7.getAs[String](url) == "https://ruebot.net/2018-11-12%2016.14.11.mp4")
+ assert(
+ r_7.getAs[String](url) == "https://ruebot.net/2018-11-12%2016.14.11.mp4"
+ )
assert(r_7.getAs[String](md5) == "2cde7de3213a87269957033f6315fce2")
val r_8 = spreadsheets.take(1)(0)
- assert(r_8.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods")
+ assert(
+ r_8.getAs[String](
+ url
+ ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods"
+ )
assert(r_8.getAs[String](md5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9")
val r_9 = powerpoint.take(1)(0)
- assert(r_9.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx")
+ assert(
+ r_9.getAs[String](
+ url
+ ) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx"
+ )
assert(r_9.getAs[String](md5) == "7a7b1fe4b6d311376eaced9de3b682ee")
val r_10 = word.take(1)(0)
- assert(r_10.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf")
+ assert(
+ r_10.getAs[String](
+ url
+ ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf"
+ )
assert(r_10.getAs[String](md5) == "e483512b65ba44d71e843c57de2adeb7")
val r_11 = all.select(url, mime_type).take(1)(0)
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala
index beca222b..faef838a 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class AudioTest extends FunSuite with BeforeAndAfter {
- private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
+ private val warcPath =
+ Resources.getResource("warc/example.media.warc.gz").getPath
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
@@ -39,12 +40,22 @@ class AudioTest extends FunSuite with BeforeAndAfter {
}
test("Audio files extraction DF") {
- val df = RecordLoader.loadArchives(warcPath, sc)
+ val df = RecordLoader
+ .loadArchives(warcPath, sc)
.audio()
- val extracted = df.select("url", "filename", "extension",
- "mime_type_web_server", "mime_type_tika", "md5")
- .orderBy(desc("md5")).head(1).toList
+ val extracted = df
+ .select(
+ "url",
+ "filename",
+ "extension",
+ "mime_type_web_server",
+ "mime_type_tika",
+ "md5"
+ )
+ .orderBy(desc("md5"))
+ .head(1)
+ .toList
assert(extracted.size == 1)
assert("https://ruebot.net/files/feniz.mp3" == extracted(0)(0))
assert("feniz.mp3" == extracted(0)(1))
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala
index cd5580ea..b6a41f88 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala
@@ -17,7 +17,12 @@
package io.archivesunleashed
import com.google.common.io.Resources
-import io.archivesunleashed.udfs.{extractDate, extractDomain, extractLinks, removePrefixWWW}
+import io.archivesunleashed.udfs.{
+ extractDate,
+ extractDomain,
+ extractLinks,
+ removePrefixWWW
+}
import org.apache.spark.sql.functions.{array, explode_outer, lower, udf}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
@@ -40,7 +45,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
}
test("Extract dates YYYY DF") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.webpages()
val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -52,14 +58,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
import org.apache.spark.sql.functions._
// scalastyle:on
- val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
- $"url".as("url"),
- extractDate($"crawl_date",lit("YYYY")).as("crawl_date"),
- explode_outer(extractLinks($"url", $"content")).as("link")
- )
- .filter(lower($"content").contains("keynote")) // filtered on keyword internet
-
- val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3)
+ val interResults = df
+ .select(
+ removePrefixWWW(extractDomain($"url")).as("Domain"),
+ $"url".as("url"),
+ extractDate($"crawl_date", lit("YYYY")).as("crawl_date"),
+ explode_outer(extractLinks($"url", $"content")).as("link")
+ )
+ .filter(
+ lower($"content").contains("keynote")
+ ) // filtered on keyword internet
+
+ val results = interResults
+ .select(
+ $"url",
+ $"Domain",
+ $"crawl_date",
+ dest(array($"link")).as("destination_page")
+ )
+ .head(3)
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
@@ -69,7 +86,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "2008")
- assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+ assert(
+ results(1).get(
+ 3
+ ) == "http://web.archive.org/collections/web/advanced.html"
+ )
assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
@@ -78,7 +99,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
}
test("Extract dates YYYYMM DF") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.webpages()
val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -90,14 +112,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
import org.apache.spark.sql.functions._
// scalastyle:on
- val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
- $"url".as("url"),
- extractDate($"crawl_date",lit("YYYYMM")).as("crawl_date"),
- explode_outer(extractLinks($"url", $"content")).as("link")
- )
- .filter(lower($"content").contains("keynote")) // filtered on keyword internet
-
- val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3)
+ val interResults = df
+ .select(
+ removePrefixWWW(extractDomain($"url")).as("Domain"),
+ $"url".as("url"),
+ extractDate($"crawl_date", lit("YYYYMM")).as("crawl_date"),
+ explode_outer(extractLinks($"url", $"content")).as("link")
+ )
+ .filter(
+ lower($"content").contains("keynote")
+ ) // filtered on keyword internet
+
+ val results = interResults
+ .select(
+ $"url",
+ $"Domain",
+ $"crawl_date",
+ dest(array($"link")).as("destination_page")
+ )
+ .head(3)
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
@@ -107,7 +140,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "200804")
- assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+ assert(
+ results(1).get(
+ 3
+ ) == "http://web.archive.org/collections/web/advanced.html"
+ )
assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
@@ -116,7 +153,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
}
test("Extract dates MM DF") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.webpages()
val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -128,14 +166,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
import org.apache.spark.sql.functions._
// scalastyle:on
- val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
- $"url".as("url"),
- extractDate($"crawl_date",lit("MM")).as("crawl_date"),
- explode_outer(extractLinks($"url", $"content")).as("link")
- )
- .filter(lower($"content").contains("keynote")) // filtered on keyword internet
-
- val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3)
+ val interResults = df
+ .select(
+ removePrefixWWW(extractDomain($"url")).as("Domain"),
+ $"url".as("url"),
+ extractDate($"crawl_date", lit("MM")).as("crawl_date"),
+ explode_outer(extractLinks($"url", $"content")).as("link")
+ )
+ .filter(
+ lower($"content").contains("keynote")
+ ) // filtered on keyword internet
+
+ val results = interResults
+ .select(
+ $"url",
+ $"Domain",
+ $"crawl_date",
+ dest(array($"link")).as("destination_page")
+ )
+ .head(3)
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
@@ -145,7 +194,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "04")
- assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+ assert(
+ results(1).get(
+ 3
+ ) == "http://web.archive.org/collections/web/advanced.html"
+ )
assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
@@ -154,7 +207,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
}
test("Extract dates DD DF") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.webpages()
val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -166,14 +220,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
import org.apache.spark.sql.functions._
// scalastyle:on
- val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
- $"url".as("url"),
- extractDate($"crawl_date",lit("DD")).as("crawl_date"),
- explode_outer(extractLinks($"url", $"content")).as("link")
- )
- .filter(lower($"content").contains("keynote")) // filtered on keyword internet
-
- val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3)
+ val interResults = df
+ .select(
+ removePrefixWWW(extractDomain($"url")).as("Domain"),
+ $"url".as("url"),
+ extractDate($"crawl_date", lit("DD")).as("crawl_date"),
+ explode_outer(extractLinks($"url", $"content")).as("link")
+ )
+ .filter(
+ lower($"content").contains("keynote")
+ ) // filtered on keyword internet
+
+ val results = interResults
+ .select(
+ $"url",
+ $"Domain",
+ $"crawl_date",
+ dest(array($"link")).as("destination_page")
+ )
+ .head(3)
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
@@ -183,7 +248,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "30")
- assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+ assert(
+ results(1).get(
+ 3
+ ) == "http://web.archive.org/collections/web/advanced.html"
+ )
assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
@@ -192,7 +261,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
}
test("Extract dates YYYYMMDD DF") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.webpages()
val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -204,14 +274,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
import org.apache.spark.sql.functions._
// scalastyle:on
- val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
- $"url".as("url"),
- extractDate($"crawl_date",lit("YYYYMMDD")).as("crawl_date"),
- explode_outer(extractLinks($"url", $"content")).as("link")
- )
- .filter(lower($"content").contains("keynote")) // filtered on keyword internet
-
- val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3)
+ val interResults = df
+ .select(
+ removePrefixWWW(extractDomain($"url")).as("Domain"),
+ $"url".as("url"),
+ extractDate($"crawl_date", lit("YYYYMMDD")).as("crawl_date"),
+ explode_outer(extractLinks($"url", $"content")).as("link")
+ )
+ .filter(
+ lower($"content").contains("keynote")
+ ) // filtered on keyword internet
+
+ val results = interResults
+ .select(
+ $"url",
+ $"Domain",
+ $"crawl_date",
+ dest(array($"link")).as("destination_page")
+ )
+ .head(3)
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
@@ -221,7 +302,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "20080430")
- assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+ assert(
+ results(1).get(
+ 3
+ ) == "http://web.archive.org/collections/web/advanced.html"
+ )
assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala
index 9bf45a60..f6b16aa1 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala
@@ -40,7 +40,8 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {
}
test("Extract links DF") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.webpages()
val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -51,14 +52,25 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {
import spark.implicits._
// scalastyle:on
- val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
- $"url".as("url"),
- $"crawl_date",
- explode_outer(extractLinks($"url",$"content")).as("link")
- )
- .filter(lower($"content").contains("keynote")) // filtered on keyword internet
+ val interResults = df
+ .select(
+ removePrefixWWW(extractDomain($"url")).as("Domain"),
+ $"url".as("url"),
+ $"crawl_date",
+ explode_outer(extractLinks($"url", $"content")).as("link")
+ )
+ .filter(
+ lower($"content").contains("keynote")
+ ) // filtered on keyword internet
- val results = interResults.select($"url",$"Domain",$"crawl_date",dest(array($"link")).as("destination_page")).head(3)
+ val results = interResults
+ .select(
+ $"url",
+ $"Domain",
+ $"crawl_date",
+ dest(array($"link")).as("destination_page")
+ )
+ .head(3)
// Results should be:
// +--------------------------------+-----------+----------+----------------------------------------------------+
@@ -69,7 +81,6 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {
// |http://www.archive.org/index.php|archive.org|20080430 |http://www.sloan.org |
// +--------------------------------+-----------+----------+----------------------------------------------------+
-
assert(results(0).get(0) == "http://www.archive.org/index.php")
assert(results(0).get(1) == "archive.org")
assert(results(0).get(2) == "20080430")
@@ -78,7 +89,11 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {
assert(results(1).get(0) == "http://www.archive.org/index.php")
assert(results(1).get(1) == "archive.org")
assert(results(1).get(2) == "20080430")
- assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+ assert(
+ results(1).get(
+ 3
+ ) == "http://web.archive.org/collections/web/advanced.html"
+ )
assert(results(2).get(0) == "http://www.archive.org/index.php")
assert(results(2).get(1) == "archive.org")
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala
index 843635ae..523ea78f 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala
@@ -39,14 +39,27 @@ class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter {
}
test("Image files extraction DF") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.images()
- val extracted = df.select("url", "mime_type_web_server", "mime_type_tika",
- "width", "height", "md5", "sha1")
- .orderBy(desc("md5")).head(2).toList
+ val extracted = df
+ .select(
+ "url",
+ "mime_type_web_server",
+ "mime_type_tika",
+ "width",
+ "height",
+ "md5",
+ "sha1"
+ )
+ .orderBy(desc("md5"))
+ .head(2)
+ .toList
assert(extracted.size == 2)
- assert("http://www.archive.org/images/mediatype_movies.gif" == extracted(0)(0))
+ assert(
+ "http://www.archive.org/images/mediatype_movies.gif" == extracted(0)(0)
+ )
assert("image/gif" == extracted(0)(1))
assert("image/gif" == extracted(0)(2))
assert(21 == extracted(0)(3))
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala
index 4fa200b2..8e199ce2 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala
@@ -39,7 +39,8 @@ class ImageLinksTest extends FunSuite with BeforeAndAfter {
}
test("Image links extraction DF") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.imagegraph()
// We need this in order to use the $-notation
@@ -47,13 +48,24 @@ class ImageLinksTest extends FunSuite with BeforeAndAfter {
// scalastyle:off
import spark.implicits._
// scalastyle:on
- val extracted = df.select($"src".as("Domain"), $"image_url".as("Image"))
- .orderBy(desc("Image")).head(2).toList
+ val extracted = df
+ .select($"src".as("Domain"), $"image_url".as("Image"))
+ .orderBy(desc("Image"))
+ .head(2)
+ .toList
assert(extracted.size == 2)
assert("http://www.archive.org/index.php" == extracted(0)(0))
- assert("http://www.archive.org/services/get-item-image.php?identifier=zh27814&collection=zh27&mediatype=audio" == extracted(0)(1))
+ assert(
+ "http://www.archive.org/services/get-item-image.php?identifier=zh27814&collection=zh27&mediatype=audio" == extracted(
+ 0
+ )(1)
+ )
assert("http://www.archive.org/index.php" == extracted(1)(0))
- assert("http://www.archive.org/services/get-item-image.php?identifier=secretarmiesb00spivrich&collection=americana&mediatype=texts" == extracted(1)(1))
+ assert(
+ "http://www.archive.org/services/get-item-image.php?identifier=secretarmiesb00spivrich&collection=americana&mediatype=texts" == extracted(
+ 1
+ )(1)
+ )
}
after {
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala
index b17fb32a..5ffdf6d6 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class ExtractPDFDetailsTest extends FunSuite with BeforeAndAfter {
- private val warcPath = Resources.getResource("warc/example.pdf.warc.gz").getPath
+ private val warcPath =
+ Resources.getResource("warc/example.pdf.warc.gz").getPath
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
@@ -39,21 +40,41 @@ class ExtractPDFDetailsTest extends FunSuite with BeforeAndAfter {
}
test("PDF files extraction DF") {
- val df = RecordLoader.loadArchives(warcPath, sc)
+ val df = RecordLoader
+ .loadArchives(warcPath, sc)
.pdfs()
- val extracted = df.select("url", "filename", "extension",
- "mime_type_web_server", "mime_type_tika", "md5")
- .orderBy(desc("md5")).head(2).toList
+ val extracted = df
+ .select(
+ "url",
+ "filename",
+ "extension",
+ "mime_type_web_server",
+ "mime_type_tika",
+ "md5"
+ )
+ .orderBy(desc("md5"))
+ .head(2)
+ .toList
assert(extracted.size == 2)
- assert("https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" == extracted(0)(0))
+ assert(
+ "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" == extracted(
+ 0
+ )(0)
+ )
assert("cost-analysis.pdf" == extracted(0)(1))
assert("pdf" == extracted(0)(2))
assert("application/pdf" == extracted(0)(3))
assert("application/pdf" == extracted(0)(4))
assert("aaba59d2287afd40c996488a39bbc0dd" == extracted(0)(5))
- assert("https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf?sequence=3&isAllowed=y" == extracted(1)(0))
- assert("JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf" == extracted(1)(1))
+ assert(
+ "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf?sequence=3&isAllowed=y" == extracted(
+ 1
+ )(0)
+ )
+ assert(
+ "JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf" == extracted(1)(1)
+ )
assert("pdf" == extracted(1)(2))
assert("application/pdf" == extracted(1)(3))
assert("application/pdf" == extracted(1)(4))
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala
index 4168e5a6..c09d432f 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class PresentationProgramFilesTest extends FunSuite with BeforeAndAfter {
- private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+ private val warcPath =
+ Resources.getResource("warc/example.docs.warc.gz").getPath
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
@@ -39,24 +40,50 @@ class PresentationProgramFilesTest extends FunSuite with BeforeAndAfter {
}
test("Presentation program files extraction DF") {
- val df = RecordLoader.loadArchives(warcPath, sc)
+ val df = RecordLoader
+ .loadArchives(warcPath, sc)
.presentationProgramFiles()
- val extracted = df.select("url", "filename", "extension",
- "mime_type_web_server", "mime_type_tika", "md5")
- .orderBy(desc("md5")).head(2).toList
+ val extracted = df
+ .select(
+ "url",
+ "filename",
+ "extension",
+ "mime_type_web_server",
+ "mime_type_tika",
+ "md5"
+ )
+ .orderBy(desc("md5"))
+ .head(2)
+ .toList
assert(extracted.size == 2)
- assert("https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.odp" == extracted(0)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.odp" == extracted(
+ 0
+ )(0)
+ )
assert("aut-test-fixtures.odp" == extracted(0)(1))
assert("odp" == extracted(0)(2))
assert("application/vnd.oasis.opendocument.presentation" == extracted(0)(3))
assert("application/vnd.oasis.opendocument.presentation" == extracted(0)(4))
assert("f38b2679029cf3453c8151b92c615c70" == extracted(0)(5))
- assert("https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" == extracted(1)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" == extracted(
+ 1
+ )(0)
+ )
assert("aut-test-fixtures.pptx" == extracted(1)(1))
assert("pptx" == extracted(1)(2))
- assert("application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(1)(3))
- assert("application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(1)(4))
+ assert(
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(
+ 1
+ )(3)
+ )
+ assert(
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(
+ 1
+ )(4)
+ )
assert("7a7b1fe4b6d311376eaced9de3b682ee" == extracted(1)(5))
}
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala
index c326b021..a36c3742 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class ExtractSpreadsheetDetailsTest extends FunSuite with BeforeAndAfter {
- private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+ private val warcPath =
+ Resources.getResource("warc/example.docs.warc.gz").getPath
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
@@ -39,32 +40,66 @@ class ExtractSpreadsheetDetailsTest extends FunSuite with BeforeAndAfter {
}
test("Spreadsheet files extraction DF") {
- val df = RecordLoader.loadArchives(warcPath, sc)
+ val df = RecordLoader
+ .loadArchives(warcPath, sc)
.spreadsheets()
- val extracted = df.select("url", "filename", "extension",
- "mime_type_web_server", "mime_type_tika", "md5")
- .orderBy(desc("md5")).head(4).toList
+ val extracted = df
+ .select(
+ "url",
+ "filename",
+ "extension",
+ "mime_type_web_server",
+ "mime_type_tika",
+ "md5"
+ )
+ .orderBy(desc("md5"))
+ .head(4)
+ .toList
assert(extracted.size == 4)
- assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.xlsx" == extracted(0)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.xlsx" == extracted(
+ 0
+ )(0)
+ )
assert("test-aut-fixture.xlsx" == extracted(0)(1))
assert("xlsx" == extracted(0)(2))
- assert("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" == extracted(0)(3))
- assert("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" == extracted(0)(4))
+ assert(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" == extracted(
+ 0
+ )(3)
+ )
+ assert(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" == extracted(
+ 0
+ )(4)
+ )
assert("befb3304cb592e0761509bf626171071" == extracted(0)(5))
- assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.tsv" == extracted(1)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.tsv" == extracted(
+ 1
+ )(0)
+ )
assert("test-aut-fixture%20-%20Sheet1.tsv" == extracted(1)(1))
assert("tsv" == extracted(1)(2))
assert("text/tab-separated-values" == extracted(1)(3))
assert("text/plain" == extracted(1)(4))
assert("8ce6e9489c1c1129cca0e3f1eb8206ce" == extracted(1)(5))
- assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods" == extracted(2)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods" == extracted(
+ 2
+ )(0)
+ )
assert("test-aut-fixture.ods" == extracted(2)(1))
assert("ods" == extracted(2)(2))
assert("application/vnd.oasis.opendocument.spreadsheet" == extracted(2)(3))
assert("application/vnd.oasis.opendocument.spreadsheet" == extracted(2)(4))
assert("7f70280757d8beb2d1bfd6fb1b6ae6e9" == extracted(2)(5))
- assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.csv" == extracted(3)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.csv" == extracted(
+ 3
+ )(0)
+ )
assert("test-aut-fixture%20-%20Sheet1.csv" == extracted(3)(1))
assert("csv" == extracted(3)(2))
assert("text/csv" == extracted(3)(3))
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala
index be1048b0..b0bf7abd 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class VideoTest extends FunSuite with BeforeAndAfter {
- private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
+ private val warcPath =
+ Resources.getResource("warc/example.media.warc.gz").getPath
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
@@ -39,12 +40,22 @@ class VideoTest extends FunSuite with BeforeAndAfter {
}
test("Video files extraction DF") {
- val df = RecordLoader.loadArchives(warcPath, sc)
+ val df = RecordLoader
+ .loadArchives(warcPath, sc)
.videos()
- val extracted = df.select("url", "filename", "extension",
- "mime_type_web_server", "mime_type_tika", "md5")
- .orderBy(desc("md5")).head(1).toList
+ val extracted = df
+ .select(
+ "url",
+ "filename",
+ "extension",
+ "mime_type_web_server",
+ "mime_type_tika",
+ "md5"
+ )
+ .orderBy(desc("md5"))
+ .head(1)
+ .toList
assert(extracted.size == 1)
assert("https://ruebot.net/2018-11-12%2016.14.11.mp4" == extracted(0)(0))
assert("2018-11-12%2016.14.11.mp4" == extracted(0)(1))
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala
index a9d474de..bc3e9100 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class WordProcessorFilesTest extends FunSuite with BeforeAndAfter {
- private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+ private val warcPath =
+ Resources.getResource("warc/example.docs.warc.gz").getPath
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
@@ -39,30 +40,60 @@ class WordProcessorFilesTest extends FunSuite with BeforeAndAfter {
}
test("Word processor files extraction DF") {
- val df = RecordLoader.loadArchives(warcPath, sc)
+ val df = RecordLoader
+ .loadArchives(warcPath, sc)
.wordProcessorFiles()
- val extracted = df.select("url", "filename", "extension",
- "mime_type_web_server", "mime_type_tika", "md5")
- .orderBy(desc("md5")).head(3).toList
+ val extracted = df
+ .select(
+ "url",
+ "filename",
+ "extension",
+ "mime_type_web_server",
+ "mime_type_tika",
+ "md5"
+ )
+ .orderBy(desc("md5"))
+ .head(3)
+ .toList
assert(extracted.size == 3)
- assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf" == extracted(0)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf" == extracted(
+ 0
+ )(0)
+ )
assert("test-aut-fixtures.rtf" == extracted(0)(1))
assert("rtf" == extracted(0)(2))
assert("application/rtf" == extracted(0)(3))
assert("application/rtf" == extracted(0)(4))
assert("e483512b65ba44d71e843c57de2adeb7" == extracted(0)(5))
- assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.odt" == extracted(1)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.odt" == extracted(
+ 1
+ )(0)
+ )
assert("test-aut-fixtures.odt" == extracted(1)(1))
assert("odt" == extracted(1)(2))
assert("application/vnd.oasis.opendocument.text" == extracted(1)(3))
assert("application/vnd.oasis.opendocument.text" == extracted(1)(4))
assert("9ef1aaee5c18cd16c47e75aaa38bd393" == extracted(1)(5))
- assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.docx" == extracted(2)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.docx" == extracted(
+ 2
+ )(0)
+ )
assert("test-aut-fixtures.docx" == extracted(2)(1))
assert("docx" == extracted(2)(2))
- assert("application/vnd.openxmlformats-officedocument.wordprocessingml.document" == extracted(2)(3))
- assert("application/vnd.openxmlformats-officedocument.wordprocessingml.document" == extracted(2)(4))
+ assert(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" == extracted(
+ 2
+ )(3)
+ )
+ assert(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" == extracted(
+ 2
+ )(4)
+ )
assert("51040165e60629c6bf63c2bd40b9e628" == extracted(2)(5))
}
diff --git a/src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala b/src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala
index 6ea5203c..f67332ac 100644
--- a/src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala
@@ -30,12 +30,18 @@ import java.io.File
import java.nio.file.{Paths, Files}
import java.util.Base64
-case class TestMediaDetails(url: String, extension: String, mime_type: String,
- md5: String, bytes: String)
+case class TestMediaDetails(
+ url: String,
+ extension: String,
+ mime_type: String,
+ md5: String,
+ bytes: String
+)
@RunWith(classOf[JUnitRunner])
class SaveMediaBytesTest extends FunSuite with BeforeAndAfter {
- private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
+ private val warcPath =
+ Resources.getResource("warc/example.media.warc.gz").getPath
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
@@ -50,11 +56,14 @@ class SaveMediaBytesTest extends FunSuite with BeforeAndAfter {
}
test("Save audio bytes to disk DF") {
- val df = RecordLoader.loadArchives(warcPath, sc)
+ val df = RecordLoader
+ .loadArchives(warcPath, sc)
.audio()
- val extracted = df.select(testString, testExtension)
- .orderBy(desc(testString)).limit(1)
+ val extracted = df
+ .select(testString, testExtension)
+ .orderBy(desc(testString))
+ .limit(1)
extracted.saveToDisk(testString, "/tmp/audio", testExtension)
val encodedBytes: String = extracted.take(1)(0).getAs(testString)
@@ -68,11 +77,19 @@ class SaveMediaBytesTest extends FunSuite with BeforeAndAfter {
}
test("Attempt to save invalid audio DF") {
- val dummyEncBytes = Base64.getEncoder.encodeToString(Array.range(0, 127)
- .map(_.toByte))
+ val dummyEncBytes = Base64.getEncoder.encodeToString(
+ Array
+ .range(0, 127)
+ .map(_.toByte)
+ )
val dummyMD5 = ComputeMD5(dummyEncBytes.getBytes)
- val dummyAudio = TestMediaDetails("http://example.com/fake.mp3", "mp3",
- "audio/mpeg", dummyMD5, dummyEncBytes)
+ val dummyAudio = TestMediaDetails(
+ "http://example.com/fake.mp3",
+ "mp3",
+ "audio/mpeg",
+ dummyMD5,
+ dummyEncBytes
+ )
// For toDF().
val spark = SparkSession.builder().master("local").getOrCreate()
@@ -84,8 +101,12 @@ class SaveMediaBytesTest extends FunSuite with BeforeAndAfter {
df.saveToDisk(testString, "/tmp/bar", "extension")
// Check that no file was written.
- assert(new File("/tmp").listFiles.filter(_.isFile).toList
- .count(_.getName.startsWith("bar-" + dummyMD5)) == 0)
+ assert(
+ new File("/tmp").listFiles
+ .filter(_.isFile)
+ .toList
+ .count(_.getName.startsWith("bar-" + dummyMD5)) == 0
+ )
}
after {
diff --git a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
index d9309f7a..b39da1d9 100644
--- a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
@@ -40,7 +40,8 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter {
}
test("Count records DF") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.webpages()
// We need this in order to use the $-notation
@@ -49,8 +50,12 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter {
import spark.implicits._
// scalastyle:on
- val results = df.select(extractDomain($"Url").as("Domain"))
- .groupBy("Domain").count().orderBy(desc("count")).head(3)
+ val results = df
+ .select(extractDomain($"Url").as("Domain"))
+ .groupBy("Domain")
+ .count()
+ .orderBy(desc("count"))
+ .head(3)
// Results should be:
// +------------------+-----+
diff --git a/src/test/scala/io/archivesunleashed/df/UdfsTests.scala b/src/test/scala/io/archivesunleashed/df/UdfsTests.scala
index a5044984..8a9c65ce 100644
--- a/src/test/scala/io/archivesunleashed/df/UdfsTests.scala
+++ b/src/test/scala/io/archivesunleashed/df/UdfsTests.scala
@@ -17,7 +17,13 @@
package io.archivesunleashed
import com.google.common.io.Resources
-import io.archivesunleashed.udfs.{computeImageSize, computeMD5, computeSHA1, extractImageLinks, getExtensionMime}
+import io.archivesunleashed.udfs.{
+ computeImageSize,
+ computeMD5,
+ computeSHA1,
+ extractImageLinks,
+ getExtensionMime
+}
import org.apache.spark.sql.functions.{desc, explode, unbase64}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
@@ -39,8 +45,11 @@ class UdfsTest extends FunSuite with BeforeAndAfter {
sc = new SparkContext(conf)
}
- test("DF Udf tests; computeSHA1, computeMD5, extractImageLinks, getExtensionMime") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ test(
+ "DF Udf tests; computeSHA1, computeMD5, extractImageLinks, getExtensionMime"
+ ) {
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.webpages()
// We need this in order to use the $-notation
@@ -49,33 +58,57 @@ class UdfsTest extends FunSuite with BeforeAndAfter {
import spark.implicits._
// scalastyle:on
- val extracted = df.select($"url", $"mime_type_web_server", $"mime_type_tika",
- computeSHA1($"content").as("sha1_test"),
- computeMD5($"content").as("md5_test"),
- explode(extractImageLinks($"url", $"content")).as("image_link"),
- getExtensionMime($"url", $"mime_type_tika").as("extension"))
- .orderBy(desc("md5_test")).head(4).toList
+ val extracted = df
+ .select(
+ $"url",
+ $"mime_type_web_server",
+ $"mime_type_tika",
+ computeSHA1($"content").as("sha1_test"),
+ computeMD5($"content").as("md5_test"),
+ explode(extractImageLinks($"url", $"content")).as("image_link"),
+ getExtensionMime($"url", $"mime_type_tika").as("extension")
+ )
+ .orderBy(desc("md5_test"))
+ .head(4)
+ .toList
assert(extracted.size == 4)
- assert(extracted(0).get(0) == "http://www.archive.org/iathreads/post-view.php?id=186011")
+ assert(
+ extracted(0).get(
+ 0
+ ) == "http://www.archive.org/iathreads/post-view.php?id=186011"
+ )
assert(extracted(0).get(1) == "text/html")
assert(extracted(0).get(2) == "text/html")
assert(extracted(0).get(3) == "9b9cd08e300f49ae59b1f2ced1bcd43fa8b5418c")
assert(extracted(0).get(4) == "ff14be99e72943e85fe2368c1e65127a")
- assert(extracted(0).get(5).toString == "[http://www.archive.org/iathreads/post-view.php?id=186011,http://www.archive.org/images/logo.jpg,(logo)]")
+ assert(
+ extracted(0)
+ .get(5)
+ .toString == "[http://www.archive.org/iathreads/post-view.php?id=186011,http://www.archive.org/images/logo.jpg,(logo)]"
+ )
assert(extracted(0).get(6) == "html")
- assert(extracted(3).get(0) == "http://www.archive.org/iathreads/forum-display.php?poster=RipJarvis")
+ assert(
+ extracted(3).get(
+ 0
+ ) == "http://www.archive.org/iathreads/forum-display.php?poster=RipJarvis"
+ )
assert(extracted(3).get(1) == "text/html")
assert(extracted(3).get(2) == "text/html")
assert(extracted(3).get(3) == "284a847892deaeb7790fe1b4123a9ccb47a246ed")
assert(extracted(3).get(4) == "fe0c87b4db0ae846924c56f389083f39")
- assert(extracted(3).get(5).toString == "[http://www.archive.org/iathreads/forum-display.php?poster=RipJarvis,http://www.archive.org/images/logo.jpg,(logo)]")
+ assert(
+ extracted(3)
+ .get(5)
+ .toString == "[http://www.archive.org/iathreads/forum-display.php?poster=RipJarvis,http://www.archive.org/images/logo.jpg,(logo)]"
+ )
assert(extracted(3).get(6) == "html")
}
test("DF Udf tests; computeImageSize, computeSHA1, computeMD5") {
- val df = RecordLoader.loadArchives(arcPath, sc)
+ val df = RecordLoader
+ .loadArchives(arcPath, sc)
.images()
// We need this in order to use the $-notation
@@ -84,13 +117,21 @@ class UdfsTest extends FunSuite with BeforeAndAfter {
import spark.implicits._
// scalastyle:on
- val extracted = df.select($"md5", $"sha1", $"height", $"width",
- computeImageSize(unbase64($"bytes")).as("image_size"),
- computeSHA1(unbase64($"bytes")).as("sha1_test"),
- computeMD5(unbase64($"bytes")).as("md5_test"))
- .withColumn("img_width", $"image_size._1")
- .withColumn("img_height", $"image_size._2")
- .orderBy(desc("md5")).head(2).toList
+ val extracted = df
+ .select(
+ $"md5",
+ $"sha1",
+ $"height",
+ $"width",
+ computeImageSize(unbase64($"bytes")).as("image_size"),
+ computeSHA1(unbase64($"bytes")).as("sha1_test"),
+ computeMD5(unbase64($"bytes")).as("md5_test")
+ )
+ .withColumn("img_width", $"image_size._1")
+ .withColumn("img_height", $"image_size._2")
+ .orderBy(desc("md5"))
+ .head(2)
+ .toList
assert(extracted.size == 2)
assert(extracted(0).get(0) == "ff05f9b408519079c992202e8c8a14ee")
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala
index 3b7981ca..ef1d5a9f 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala
@@ -30,17 +30,18 @@ import org.scalatest.junit.JUnitRunner
class ComputeImageSizeTest extends FunSuite {
val testImageSize = 10
var ios: ByteArrayOutputStream = new ByteArrayOutputStream();
- val img = new BufferedImage(testImageSize, testImageSize, BufferedImage.TYPE_INT_RGB)
+ val img =
+ new BufferedImage(testImageSize, testImageSize, BufferedImage.TYPE_INT_RGB)
ImageIO.write(img, "png", ios)
ios.flush()
var image: Array[Byte] = ios.toByteArray();
ios.close()
- test ("Check images and provide size RDD") {
+ test("Check images and provide size RDD") {
val imageSize = (10, 10)
val emptyImageSize = (0, 0)
assert(ComputeImageSize(image) == imageSize)
- assert(ComputeImageSize(Array[Byte](0,0,0)) == emptyImageSize)
+ assert(ComputeImageSize(Array[Byte](0, 0, 0)) == emptyImageSize)
// scalastyle:off null
assert(ComputeImageSize(null) == emptyImageSize)
// scalastyle:on null
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala
index 1f9a90bc..c036df3a 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala
@@ -25,8 +25,8 @@ import org.scalatest.junit.JUnitRunner
@RunWith(classOf[JUnitRunner])
class ExtractBoilerPipeTextTest extends FunSuite {
val header = "HTTP/1.0 200 OK Content-Type: text/html;" +
- "charset=UTF-8 Expires: Fri, 20 Jul 2018 19:09:28 GMT Date:" +
- "Fri, 20 Jul 2018 19:09:28 GMT Cache-Control: private,;\r\n\r\n"
+ "charset=UTF-8 Expires: Fri, 20 Jul 2018 19:09:28 GMT Date:" +
+ "Fri, 20 Jul 2018 19:09:28 GMT Cache-Control: private,;\r\n\r\n"
var text = """Text with a boiler plate.
"""
var boiler = """Copyright 2017"""
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala
index f2cffc4d..24c8f375 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala
@@ -16,7 +16,13 @@
package io.archivesunleashed.matchbox
-import io.archivesunleashed.matchbox.ExtractDate.DateComponent.{DD, MM, YYYY, YYYYMM, YYYYMMDD}
+import io.archivesunleashed.matchbox.ExtractDate.DateComponent.{
+ DD,
+ MM,
+ YYYY,
+ YYYYMM,
+ YYYYMMDD
+}
import org.junit.runner.RunWith
import org.scalatest.FunSuite
import org.scalatest.junit.JUnitRunner
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala
index 077a69e9..fb0af680 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala
@@ -28,19 +28,34 @@ class ExtractDomainTest extends FunSuite {
private val lintool = "https://github.com/lintool"
private val github = "github.com"
- private val data1: Seq[(String, String)] = Seq.newBuilder.+=(
- (jimmylin, umiacs),
- (lintool, github),
- ("http://ianmilligan.ca/2015/05/04/iipc-2015-slides-for-warcs-wats-and-wgets-presentation/", "ianmilligan.ca"),
- (index, "")).result()
+ private val data1: Seq[(String, String)] = Seq.newBuilder
+ .+=(
+ (jimmylin, umiacs),
+ (lintool, github),
+ (
+ "http://ianmilligan.ca/2015/05/04/iipc-2015-slides-for-warcs-wats-and-wgets-presentation/",
+ "ianmilligan.ca"
+ ),
+ (index, "")
+ )
+ .result()
- private val data2 = Seq.newBuilder.+=(
- (index, jimmylin, umiacs),
- (lintool, jimmylin, github),
- (index, lintool, github)).result()
+ private val data2 = Seq.newBuilder
+ .+=(
+ (index, jimmylin, umiacs),
+ (lintool, jimmylin, github),
+ (index, lintool, github)
+ )
+ .result()
- private val data3 = Seq.newBuilder.+=(
- ("http://www.seetorontonow.canada-booknow.com\\booking_results.php", "www.seetorontonow.canada-booknow.com")).result()
+ private val data3 = Seq.newBuilder
+ .+=(
+ (
+ "http://www.seetorontonow.canada-booknow.com\\booking_results.php",
+ "www.seetorontonow.canada-booknow.com"
+ )
+ )
+ .result()
test("Extract simple domain extraction RDD") {
data1.foreach {
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala
index 6c3ec63d..938b062c 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala
@@ -27,7 +27,8 @@ class ExtractImageLinksTest extends FunSuite {
test("Extract simple image links RDD") {
val fragment: String =
"""Image here: and another """
- val extracted: Seq[(String, String, String)] = ExtractImageLinks("", fragment)
+ val extracted: Seq[(String, String, String)] =
+ ExtractImageLinks("", fragment)
assert(extracted.size == 2)
assert("http://foo.bar.com/pic.png" == extracted(0)._2)
assert("picture" == extracted(0)._3)
@@ -38,7 +39,8 @@ class ExtractImageLinksTest extends FunSuite {
test("Extract relative image links RDD") {
val fragment: String =
"""Image here: and another and """
- val extracted: Seq[(String, String, String)] = ExtractImageLinks("http://foo.bar.com/a/page.html", fragment)
+ val extracted: Seq[(String, String, String)] =
+ ExtractImageLinks("http://foo.bar.com/a/page.html", fragment)
assert(extracted.size == 3)
assert("http://foo.bar.com/a/pic.png" == extracted(0)._2)
assert("picture" == extracted(0)._3)
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala
index b2b8e339..e69321ba 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala
@@ -28,8 +28,9 @@ import scala.collection.mutable
@RunWith(classOf[JUnitRunner])
class ExtractLinksTest extends FunSuite {
- val fragment: String = "Here is a search engine.\n" +
- "Here is Twitter.\n"
+ val fragment: String =
+ "Here is a search engine.\n" +
+ "Here is Twitter.\n"
val fooFragment: String = "http://www.foobar.org/index.html"
val url = "http://www.google.com"
val twitter = "http://www.twitter.com/"
@@ -46,9 +47,10 @@ class ExtractLinksTest extends FunSuite {
test("Extract relative links RDD") {
val fragmentLocal: String = "Here is " +
- "a search engine.\nHere is a a relative URL.\n"
+ "a search engine.\nHere is a a relative URL.\n"
val fooFragmentLocal = "http://www.foobar.org/page.html"
- val extracted: Seq[(String, String, String)] = ExtractLinks("", fragmentLocal, fooFragment)
+ val extracted: Seq[(String, String, String)] =
+ ExtractLinks("", fragmentLocal, fooFragment)
assert(extracted.size == 2)
assert(url == extracted.head._2)
assert(head == extracted.head._3)
@@ -58,10 +60,17 @@ class ExtractLinksTest extends FunSuite {
test("Test link errors RDD") {
val bytes: Array[Byte] = "wronglyTyped".getBytes()
- val invalid: String = "Here is a fake url bogus search engine."
+ val invalid: String =
+ "Here is a fake url bogus search engine."
// scalastyle:off null
- assert(ExtractLinks(null, fragment, fooFragment) == mutable.MutableList[(String, String, String)]())
+ assert(
+ ExtractLinks(null, fragment, fooFragment) == mutable
+ .MutableList[(String, String, String)]()
+ )
// scalastyle:on null
- assert(ExtractLinks("", "", fooFragment) == mutable.MutableList[(String, String, String)]())
+ assert(
+ ExtractLinks("", "", fooFragment) == mutable
+ .MutableList[(String, String, String)]()
+ )
}
}
diff --git a/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala b/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala
index 55fa8520..66f64454 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala
@@ -27,7 +27,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
@RunWith(classOf[JUnitRunner])
class GetExtensionMIMETest extends FunSuite with BeforeAndAfter {
- private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
+ private val warcPath =
+ Resources.getResource("warc/example.media.warc.gz").getPath
private val master = "local[4]"
private val appName = "example-df"
private var sc: SparkContext = _
@@ -42,14 +43,28 @@ class GetExtensionMIMETest extends FunSuite with BeforeAndAfter {
}
test("Get extension of file from URL with no extension") {
- df = RecordLoader.loadArchives(warcPath, sc)
+ df = RecordLoader
+ .loadArchives(warcPath, sc)
.images()
- extracted = df.select("url", "filename", "extension",
- "mime_type_web_server", "mime_type_tika", "md5")
- .orderBy(desc("md5")).head(3).toList
+ extracted = df
+ .select(
+ "url",
+ "filename",
+ "extension",
+ "mime_type_web_server",
+ "mime_type_tika",
+ "md5"
+ )
+ .orderBy(desc("md5"))
+ .head(3)
+ .toList
assert(extracted.size == 3)
- assert("https://ruebot.net/files/aut-test-fixtures/this_is_a_gif" == extracted(0)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/this_is_a_gif" == extracted(
+ 0
+ )(0)
+ )
assert("this_is_a_gif" == extracted(0)(1))
assert("gif" == extracted(0)(2))
assert("unknown" == extracted(0)(3))
@@ -58,7 +73,11 @@ class GetExtensionMIMETest extends FunSuite with BeforeAndAfter {
}
test("Get extension of file from URL with correct extension") {
- assert("https://ruebot.net/files/aut-test-fixtures/real_png.png" == extracted(1)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/real_png.png" == extracted(1)(
+ 0
+ )
+ )
assert("real_png.png" == extracted(1)(1))
assert("png" == extracted(1)(2))
assert("image/png" == extracted(1)(3))
@@ -67,7 +86,11 @@ class GetExtensionMIMETest extends FunSuite with BeforeAndAfter {
}
test("Get extension of file from URL with incorrect extension") {
- assert("https://ruebot.net/files/aut-test-fixtures/this_is_a_jpeg.mp3" == extracted(2)(0))
+ assert(
+ "https://ruebot.net/files/aut-test-fixtures/this_is_a_jpeg.mp3" == extracted(
+ 2
+ )(0)
+ )
assert("this_is_a_jpeg.mp3" == extracted(2)(1))
assert("jpg" == extracted(2)(2))
assert("audio/mpeg" == extracted(2)(3))
diff --git a/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala b/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala
index 6d2d81d0..7635a867 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala
@@ -39,17 +39,19 @@ class StringUtilsTest extends FunSuite {
val except: String = null;
// scalastyle:on null
assert(invalid.escapeInvalidXML() == "A<B>C&D"");
- val caught = intercept[IOException] {except.escapeInvalidXML()}
- assert (caught.getMessage == "Caught exception processing input row ");
+ val caught = intercept[IOException] { except.escapeInvalidXML() }
+ assert(caught.getMessage == "Caught exception processing input row ");
}
- test ("MD5 hash") {
+ test("MD5 hash") {
val s: String = "unesco.org";
assert(ComputeMD5(s.getBytes) == "8e8decc8e8107bcf9d3896f3222b77d8");
}
- test ("SHA1 hash") {
+ test("SHA1 hash") {
val s: String = "unesco.org";
- assert(ComputeSHA1(s.getBytes) == "2d0e5377157172045d87befe46e157cda42c4f6e");
+ assert(
+ ComputeSHA1(s.getBytes) == "2d0e5377157172045d87befe46e157cda42c4f6e"
+ );
}
}