diff --git a/.github/workflows/scalaformatter.yml b/.github/workflows/scalaformatter.yml new file mode 100644 index 00000000..df4241d6 --- /dev/null +++ b/.github/workflows/scalaformatter.yml @@ -0,0 +1,16 @@ +name: Run scalafmt + +on: + push: + branches: + - master + +jobs: + scalafmt-lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v1 + with: + fetch-depth: 1 + - name: Running scalafmt + uses: openlawteam/scalafmt-ci@v2 diff --git a/src/main/scala/io/archivesunleashed/ArchiveRecord.scala b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala index af028d0e..ea995022 100644 --- a/src/main/scala/io/archivesunleashed/ArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala @@ -20,8 +20,17 @@ import java.io.ByteArrayInputStream import java.security.MessageDigest import java.text.SimpleDateFormat -import io.archivesunleashed.data.{ArcRecordUtils, WarcRecordUtils, ArchiveRecordWritable} -import io.archivesunleashed.matchbox.{ComputeMD5, ExtractDate, ExtractDomain, RemoveHTTPHeader} +import io.archivesunleashed.data.{ + ArcRecordUtils, + WarcRecordUtils, + ArchiveRecordWritable +} +import io.archivesunleashed.matchbox.{ + ComputeMD5, + ExtractDate, + ExtractDomain, + RemoveHTTPHeader +} import org.apache.commons.httpclient.{Header, HttpParser, StatusLine} import org.apache.spark.SerializableWritable import org.archive.io.arc.ARCRecord @@ -31,6 +40,7 @@ import scala.util.Try /** Trait for a record in a web archive. */ trait ArchiveRecord extends Serializable { + /** Returns the full path or url containing the Archive Records. */ def getArchiveFilename: String @@ -66,16 +76,17 @@ trait ArchiveRecord extends Serializable { } /** Default implementation of a record in a web archive. - * - * @constructor an archive record. - * @param r the serialized record - */ -class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends ArchiveRecord { + * + * @constructor an archive record. 
+ * @param r the serialized record + */ +class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) + extends ArchiveRecord { val recordFormat = r.t.getFormat val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX") val getArchiveFilename: String = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC){ + if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getReaderIdentifier() } else { r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getReaderIdentifier() @@ -83,32 +94,43 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends } val getCrawlDate: String = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC){ - ExtractDate(r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getDate, - ExtractDate.DateComponent.YYYYMMDD) + if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { + ExtractDate( + r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getDate, + ExtractDate.DateComponent.YYYYMMDD + ) } else { ExtractDate( ArchiveUtils.get14DigitDate( - ISO8601.parse(r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getDate)), - ExtractDate.DateComponent.YYYYMMDD) + ISO8601.parse( + r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getDate + ) + ), + ExtractDate.DateComponent.YYYYMMDD + ) } } val getCrawlMonth: String = { if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - ExtractDate(r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getDate, - ExtractDate.DateComponent.YYYYMM) + ExtractDate( + r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getDate, + ExtractDate.DateComponent.YYYYMM + ) } else { ExtractDate( ArchiveUtils.get14DigitDate( - ISO8601.parse(r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getDate)), - ExtractDate.DateComponent.YYYYMM) + ISO8601.parse( + r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getDate + ) + ), + ExtractDate.DateComponent.YYYYMM + ) } } val getContentBytes: Array[Byte] = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) - { + if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { ArcRecordUtils.getContent(r.t.getRecord.asInstanceOf[ARCRecord]) } else { WarcRecordUtils.getContent(r.t.getRecord.asInstanceOf[WARCRecord]) @@ -121,9 +143,11 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends val getMimeType: String = { if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - Option(r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getMimetype).getOrElse("unknown") + Option(r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getMimetype) + .getOrElse("unknown") } else { - Option(WarcRecordUtils.getWarcResponseMimeType(getContentBytes)).getOrElse("unknown") + Option(WarcRecordUtils.getWarcResponseMimeType(getContentBytes)) + .getOrElse("unknown") } } @@ -137,14 +161,19 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends val getHttpStatus: String = { if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { - Option(r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getStatusCode).getOrElse("000") + Option(r.t.getRecord.asInstanceOf[ARCRecord].getMetaData.getStatusCode) + .getOrElse("000") } else { - Try(new StatusLine(new String(HttpParser.readRawLine - (new ByteArrayInputStream(getContentBytes)))) - .getStatusCode).toOption match { - case Some(x) => x.toString - case None => "000" - } + Try( + new StatusLine( + new String( + HttpParser.readRawLine(new ByteArrayInputStream(getContentBytes)) + ) + ).getStatusCode + ).toOption match { 
+ case Some(x) => x.toString + case None => "000" + } } } @@ -156,17 +185,27 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends if (getContentString.startsWith("HTTP/")) { getContentBytes.slice( getContentString.indexOf(RemoveHTTPHeader.headerEnd) - + RemoveHTTPHeader.headerEnd.length, getContentBytes.length) + + RemoveHTTPHeader.headerEnd.length, + getContentBytes.length + ) } else { getContentBytes } } val getPayloadDigest: String = { - if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC){ - "sha1:" + MessageDigest.getInstance("SHA1").digest(getContentBytes).map("%02x".format(_)).mkString + if (recordFormat == ArchiveRecordWritable.ArchiveFormat.ARC) { + "sha1:" + MessageDigest + .getInstance("SHA1") + .digest(getContentBytes) + .map("%02x".format(_)) + .mkString } else { - r.t.getRecord.asInstanceOf[WARCRecord].getHeader.getHeaderValue("WARC-Payload-Digest").asInstanceOf[String] + r.t.getRecord + .asInstanceOf[WARCRecord] + .getHeader + .getHeaderValue("WARC-Payload-Digest") + .asInstanceOf[String] } } } diff --git a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala index d0c19926..c162b51a 100644 --- a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala @@ -20,6 +20,7 @@ import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object AudioInformationExtractor { + /** Extract information about audio files from web archive using * DataFrame and Spark SQL. * @@ -32,12 +33,14 @@ object AudioInformationExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select($"url", - $"filename", - $"extension", - $"mime_type_web_server", - $"mime_type_tika", - $"md5", - $"sha1") + d.select( + $"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1" + ) } } diff --git a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala index e471b382..6c9cfa5b 100644 --- a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala +++ b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala @@ -71,12 +71,13 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) { * @param e exception that Scallop throws */ // scalastyle:off regex - override def onError(e: Throwable): Unit = e match { - case ScallopException(message) => - println(message) - throw new IllegalArgumentException() - case other: Any => throw other - } + override def onError(e: Throwable): Unit = + e match { + case ScallopException(message) => + println(message) + throw new IllegalArgumentException() + case other: Any => throw other + } // scalastyle:on regex mainOptions = Seq(input, output) @@ -84,7 +85,8 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) { val input = opt[List[String]](descr = "input file path", required = true) val output = opt[String](descr = "output directory path", required = true) val outputFormat = opt[String](descr = - "output format for DomainGraphExtractor, one of csv, gexf, or graphml") + "output format for DomainGraphExtractor, one of csv, gexf, or graphml" + ) val split = opt[Boolean]() val partition = opt[Int]() verify() @@ -99,7 +101,7 @@ class CommandLineApp(conf: CmdAppConf) { private val logger = Logger.getLogger(getClass().getName()) private val configuration = conf 
private var saveTarget = "" - private var sparkCtx : Option[SparkContext] = None + private var sparkCtx: Option[SparkContext] = None /** Maps extractor type string to DataFrame Extractors. * @@ -112,11 +114,15 @@ class CommandLineApp(conf: CmdAppConf) { private val extractors = Map[String, List[String] => Any]( "AudioInformationExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).audio() + var df = + RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).audio() inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).audio()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(AudioInformationExtractor(df)) } else { saveCsv(AudioInformationExtractor(df)) @@ -124,11 +130,15 @@ class CommandLineApp(conf: CmdAppConf) { }), "DomainFrequencyExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() + var df = + RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(DomainFrequencyExtractor(df)) } else { saveCsv(DomainFrequencyExtractor(df)) @@ -136,29 +146,49 @@ class CommandLineApp(conf: CmdAppConf) { }), "DomainGraphExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webgraph() + var df = + RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webgraph() inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "gexf") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "gexf" + ) { new File(saveTarget).mkdirs() - WriteGEXF(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GEXF.gexf") - } else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + WriteGEXF( + DomainGraphExtractor(df).collect(), + Paths.get(saveTarget).toString + "/GEXF.gexf" + ) + } else if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(DomainGraphExtractor(df)) - } else if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "graphml") { + } else if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "graphml" + ) { new File(saveTarget).mkdirs() - WriteGraphML(DomainGraphExtractor(df).collect(), Paths.get(saveTarget).toString + "/GRAPHML.graphml") + WriteGraphML( + DomainGraphExtractor(df).collect(), + Paths.get(saveTarget).toString + "/GRAPHML.graphml" + ) } else { saveCsv(DomainGraphExtractor(df)) } }), "ImageInformationExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).images() + var df = + RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).images() inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).images()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + 
!configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(ImageInformationExtractor(df)) } else { saveCsv(ImageInformationExtractor(df)) @@ -166,11 +196,15 @@ class CommandLineApp(conf: CmdAppConf) { }), "ImageGraphExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).imagegraph() + var df = + RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).imagegraph() inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).imagegraph()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(ImageGraphExtractor(df)) } else { saveCsv(ImageGraphExtractor(df)) @@ -182,7 +216,10 @@ class CommandLineApp(conf: CmdAppConf) { inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).pdfs()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(PDFInformationExtractor(df)) } else { saveCsv(PDFInformationExtractor(df)) @@ -190,11 +227,15 @@ class CommandLineApp(conf: CmdAppConf) { }), "PlainTextExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() + var df = + RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(PlainTextExtractor(df)) } else { saveCsv(PlainTextExtractor(df)) @@ -202,11 +243,20 @@ class CommandLineApp(conf: CmdAppConf) { }), "PresentationProgramInformationExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).presentationProgramFiles() + var df = RecordLoader + .loadArchives(inputFiles.head, sparkCtx.get) + .presentationProgramFiles() inputFiles.tail foreach { f => - df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).presentationProgramFiles()) + df = df.union( + RecordLoader + .loadArchives(f, sparkCtx.get) + .presentationProgramFiles() + ) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(PresentationProgramInformationExtractor(df)) } else { saveCsv(PresentationProgramInformationExtractor(df)) @@ -214,11 +264,17 @@ class CommandLineApp(conf: CmdAppConf) { }), "SpreadsheetInformationExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).spreadsheets() + var df = RecordLoader + .loadArchives(inputFiles.head, sparkCtx.get) + .spreadsheets() inputFiles.tail foreach { f => - df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).spreadsheets()) + df = + df.union(RecordLoader.loadArchives(f, sparkCtx.get).spreadsheets()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(SpreadsheetInformationExtractor(df)) } else { 
saveCsv(SpreadsheetInformationExtractor(df)) @@ -226,11 +282,15 @@ class CommandLineApp(conf: CmdAppConf) { }), "VideoInformationExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).videos() + var df = + RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).videos() inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).videos()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(VideoInformationExtractor(df)) } else { saveCsv(VideoInformationExtractor(df)) @@ -238,11 +298,15 @@ class CommandLineApp(conf: CmdAppConf) { }), "WebGraphExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webgraph() + var df = + RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webgraph() inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webgraph()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(WebGraphExtractor(df)) } else { saveCsv(WebGraphExtractor(df)) @@ -250,11 +314,15 @@ class CommandLineApp(conf: CmdAppConf) { }), "WebPagesExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() + var df = + RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).webpages() inputFiles.tail foreach { f => df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).webpages()) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(WebPagesExtractor(df)) } else { saveCsv(WebPagesExtractor(df)) @@ -262,11 +330,18 @@ class CommandLineApp(conf: CmdAppConf) { }), "WordProcessorInformationExtractor" -> ((inputFiles: List[String]) => { - var df = RecordLoader.loadArchives(inputFiles.head, sparkCtx.get).wordProcessorFiles() + var df = RecordLoader + .loadArchives(inputFiles.head, sparkCtx.get) + .wordProcessorFiles() inputFiles.tail foreach { f => - df = df.union(RecordLoader.loadArchives(f, sparkCtx.get).wordProcessorFiles()) + df = df.union( + RecordLoader.loadArchives(f, sparkCtx.get).wordProcessorFiles() + ) } - if (!configuration.outputFormat.isEmpty && configuration.outputFormat() == "parquet") { + if ( + !configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet" + ) { saveParquet(WordProcessorInformationExtractor(df)) } else { saveCsv(WordProcessorInformationExtractor(df)) @@ -283,7 +358,8 @@ class CommandLineApp(conf: CmdAppConf) { def saveCsv(d: Dataset[Row]): Unit = { if (!configuration.partition.isEmpty) { - d.coalesce(configuration.partition()).write + d.coalesce(configuration.partition()) + .write .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") .option("header", "true") .csv(saveTarget) @@ -303,7 +379,8 @@ class CommandLineApp(conf: CmdAppConf) { def saveParquet(d: Dataset[Row]): Unit = { if (!configuration.partition.isEmpty) { - d.coalesce(configuration.partition()).write + d.coalesce(configuration.partition()) + .write .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") .parquet(saveTarget) } else { @@ -326,7 +403,8 @@ class CommandLineApp(conf: CmdAppConf) { if 
(!Files.exists(Paths.get(f))) { logger.error(f + " not found") throw new IllegalArgumentException() - }} + } + } if (Files.exists(Paths.get(configuration.output()))) { logger.error(configuration.output() + " already exists") @@ -338,9 +416,9 @@ class CommandLineApp(conf: CmdAppConf) { * * @return String */ - def setAppName(): String = { - "aut - " + configuration.extractor() - } + def setAppName(): String = { + "aut - " + configuration.extractor() + } /** Prepare for invoking extractors. * @@ -349,8 +427,10 @@ class CommandLineApp(conf: CmdAppConf) { def handler(): Any = { if (!(extractors contains configuration.extractor())) { - logger.error(configuration.extractor() + " not supported. " + - "The following extractors are supported: ") + logger.error( + configuration.extractor() + " not supported. " + + "The following extractors are supported: " + ) extractors foreach { tuple => logger.error(tuple._1) } throw new IllegalArgumentException() } @@ -364,7 +444,9 @@ class CommandLineApp(conf: CmdAppConf) { if (!configuration.split.isEmpty && configuration.split()) { configuration.input() foreach { f => - saveTarget = Paths.get(configuration.output(), Paths.get(f).getFileName.toString).toString + saveTarget = Paths + .get(configuration.output(), Paths.get(f).getFileName.toString) + .toString extractFunction(List[String](f)) } } else { @@ -403,7 +485,7 @@ object CommandLineAppRunner { app.verifyArgumentsOrExit() } catch { case e: IllegalArgumentException => System.exit(1) - case x: Throwable => throw x + case x: Throwable => throw x } val appName = app.setAppName() diff --git a/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala b/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala index 6333f1d6..192f466e 100644 --- a/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.functions.desc import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object DomainFrequencyExtractor { + /** Extract domain frequency from web archive using DataFrame and Spark SQL. * * @param d DataFrame obtained from RecordLoader diff --git a/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala b/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala index f296323b..07299efc 100644 --- a/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala @@ -23,6 +23,7 @@ import org.apache.spark.sql.functions.desc import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object DomainGraphExtractor { + /** Extract domain graph from web archive using DataFrame and Spark SQL. 
* * @param d DataFrame obtained from RecordLoader @@ -34,13 +35,13 @@ object DomainGraphExtractor { import spark.implicits._ // scalastyle:on d.groupBy( - $"crawl_date", - removePrefixWWW(extractDomain($"src")).as("src_domain"), - removePrefixWWW(extractDomain($"dest")).as("dest_domain")) - .count() - .filter(!($"dest_domain"==="")) - .filter(!($"src_domain"==="")) - .filter($"count" > 5) - .orderBy(desc("count")) + $"crawl_date", + removePrefixWWW(extractDomain($"src")).as("src_domain"), + removePrefixWWW(extractDomain($"dest")).as("dest_domain") + ).count() + .filter(!($"dest_domain" === "")) + .filter(!($"src_domain" === "")) + .filter($"count" > 5) + .orderBy(desc("count")) } } diff --git a/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala b/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala index c7d31b81..9efbd77a 100644 --- a/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala +++ b/src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala @@ -28,25 +28,44 @@ object ExtractPopularImages { val MIN_HEIGHT: Int = 30 /** Extracts the n most popular images from an RDD within a given size range. - * - * @param records - * @param limit number of most popular images in the output - * @param sc SparkContext - * @param minWidth of image - * @param minHeight of image - */ - def apply(records: RDD[ArchiveRecord], limit: Int, sc:SparkContext, minWidth: Int = MIN_WIDTH, minHeight: Int = MIN_HEIGHT): RDD[String] = { + * + * @param records + * @param limit number of most popular images in the output + * @param sc SparkContext + * @param minWidth of image + * @param minHeight of image + */ + def apply( + records: RDD[ArchiveRecord], + limit: Int, + sc: SparkContext, + minWidth: Int = MIN_WIDTH, + minHeight: Int = MIN_HEIGHT + ): RDD[String] = { val res = records .keepImages() .map(r => ((r.getUrl, r.getBinaryBytes), 1)) - .map(img => (ComputeMD5(img._1._2), (ComputeImageSize(img._1._2), img._1._1, img._2))) + .map(img => + ( + ComputeMD5(img._1._2), + (ComputeImageSize(img._1._2), img._1._1, img._2) + ) + ) .filter(img => img._2._1._1 >= minWidth && img._2._1._2 >= minHeight) - .reduceByKey((image1, image2) => (image1._1, image1._2, image1._3 + image2._3)) - .map(x=> (x._2._3, x._2._2)) + .reduceByKey((image1, image2) => + (image1._1, image1._2, image1._3 + image2._3) + ) + .map(x => (x._2._3, x._2._2)) .takeOrdered(limit)(Ordering[Int].on(x => -x._1)) - val numPartitions = if (limit <= LIMIT_MAXIMUM) 1 else Math.ceil(limit / LIMIT_DENOMINATOR).toInt + val numPartitions = + if (limit <= LIMIT_MAXIMUM) 1 + else Math.ceil(limit / LIMIT_DENOMINATOR).toInt val rdd = sc.parallelize(res) - rdd.repartitionAndSortWithinPartitions( - new RangePartitioner(numPartitions, rdd, false)).sortByKey(false).map(x=>x._1 + "\t" + x._2) + rdd + .repartitionAndSortWithinPartitions( + new RangePartitioner(numPartitions, rdd, false) + ) + .sortByKey(false) + .map(x => x._1 + "\t" + x._2) } } diff --git a/src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala b/src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala index 781558f9..a3fe4847 100644 --- a/src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala +++ b/src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala @@ -26,26 +26,32 @@ object ExtractPopularImagesDF { val MIN_HEIGHT: Int = 30 /** Extracts the n most popular images from a DataFrame within a given size range.
- * - * @param d DataFrame obtained from RecordLoader - * @param limit number of most popular images in the output - * @param minWidth of image - * @param minHeight of image - * @return Dataset[Row], where the schema is (url, count) - */ - def apply(d: DataFrame, limit: Int, minWidth: Int = MIN_WIDTH, minHeight: Int = MIN_HEIGHT): Dataset[Row] = { + * + * @param d DataFrame obtained from RecordLoader + * @param limit number of most popular images in the output + * @param minWidth of image + * @param minHeight of image + * @return Dataset[Row], where the schema is (url, count) + */ + def apply( + d: DataFrame, + limit: Int, + minWidth: Int = MIN_WIDTH, + minHeight: Int = MIN_HEIGHT + ): Dataset[Row] = { val spark = SparkSession.builder().master("local").getOrCreate() // scalastyle:off import spark.implicits._ // scalastyle:on - val df = d.select($"url", $"md5") - .filter(($"width") >= minWidth && ($"height") >= minHeight) + val df = d + .select($"url", $"md5") + .filter(($"width") >= minWidth && ($"height") >= minHeight) val count = df.groupBy("md5").count() - df.join(count,"md5") + df.join(count, "md5") .groupBy("md5") .agg(first("url").as("url"), first("count").as("count")) .select("url", "count") diff --git a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala index 20bbfb2c..aa968459 100644 --- a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala @@ -20,6 +20,7 @@ import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object ImageInformationExtractor { + /** Extract information about images from web archive using DataFrame * and Spark SQL. * @@ -32,14 +33,16 @@ object ImageInformationExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select($"url", - $"filename", - $"extension", - $"mime_type_web_server", - $"mime_type_tika", - $"width", - $"height", - $"md5", - $"sha1") + d.select( + $"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"width", + $"height", + $"md5", + $"sha1" + ) } } diff --git a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala index 46e61bda..ea51374b 100644 --- a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala @@ -20,6 +20,7 @@ import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object PDFInformationExtractor { + /** Extract information about PDFs from web archive using DataFrame * and Spark SQL. 
* @@ -32,12 +33,14 @@ object PDFInformationExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select($"url", - $"filename", - $"extension", - $"mime_type_web_server", - $"mime_type_tika", - $"md5", - $"sha1") + d.select( + $"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1" + ) } } diff --git a/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala b/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala index afc6e696..55ed791f 100644 --- a/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala @@ -21,6 +21,7 @@ import io.archivesunleashed.udfs.{extractBoilerpipeText} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object PlainTextExtractor { + /** Extract plain text from web archive using DataFrame and Spark SQL. * * @param d DataFrame obtained from RecordLoader diff --git a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala index 7aa9d02f..98594fd6 100644 --- a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala @@ -20,6 +20,7 @@ import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object PresentationProgramInformationExtractor { + /** Extract information about presentation program files * from web archive using DataFrame and Spark SQL. * @@ -32,12 +33,14 @@ object PresentationProgramInformationExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select($"url", - $"filename", - $"extension", - $"mime_type_web_server", - $"mime_type_tika", - $"md5", - $"sha1") + d.select( + $"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1" + ) } } diff --git a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala index ec153068..e303add0 100644 --- a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala @@ -20,6 +20,7 @@ import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object SpreadsheetInformationExtractor { + /** Extract information about spreadsheets from web archive using * DataFrame and Spark SQL. 
* @@ -32,12 +33,14 @@ object SpreadsheetInformationExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select($"url", - $"filename", - $"extension", - $"mime_type_web_server", - $"mime_type_tika", - $"md5", - $"sha1") + d.select( + $"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1" + ) } } diff --git a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala index 654a3427..1cd3e392 100644 --- a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala @@ -20,6 +20,7 @@ import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object VideoInformationExtractor { + /** Extract information about videos from web archive using DataFrame * and Spark SQL. * @@ -32,12 +33,14 @@ object VideoInformationExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select($"url", - $"filename", - $"extension", - $"mime_type_web_server", - $"mime_type_tika", - $"md5", - $"sha1") + d.select( + $"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1" + ) } } diff --git a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala index 9382cba9..51ffe6c8 100644 --- a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala @@ -17,11 +17,16 @@ package io.archivesunleashed.app import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.udfs.{extractDomain, removeHTML, - removeHTTPHeader, removePrefixWWW} +import io.archivesunleashed.udfs.{ + extractDomain, + removeHTML, + removeHTTPHeader, + removePrefixWWW +} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object WebPagesExtractor { + /** Extract web pages from web archive using DataFrame and Spark SQL. * * @param d DataFrame obtained from RecordLoader @@ -33,12 +38,14 @@ object WebPagesExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select($"crawl_date", - removePrefixWWW(extractDomain($"url")).as("domain"), - $"url", - $"mime_type_web_server", - $"mime_type_tika", - $"language", - removeHTML(removeHTTPHeader(($"content"))).alias("content")) + d.select( + $"crawl_date", + removePrefixWWW(extractDomain($"url")).as("domain"), + $"url", + $"mime_type_web_server", + $"mime_type_tika", + $"language", + removeHTML(removeHTTPHeader(($"content"))).alias("content") + ) } } diff --git a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala index 3ebc8bb1..a314a7cb 100644 --- a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala @@ -20,6 +20,7 @@ import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object WordProcessorInformationExtractor { + /** Extract information about word processor files from web archive * using DataFrame and Spark SQL. 
* @@ -32,12 +33,14 @@ object WordProcessorInformationExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select($"url", - $"filename", - $"extension", - $"mime_type_web_server", - $"mime_type_tika", - $"md5", - $"sha1") + d.select( + $"url", + $"filename", + $"extension", + $"mime_type_web_server", + $"mime_type_tika", + $"md5", + $"sha1" + ) } } diff --git a/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala index f850efee..edf13873 100644 --- a/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGEXF.scala @@ -21,17 +21,18 @@ import java.nio.file.{Files, Paths} import org.apache.spark.sql.Row object WriteGEXF { + /** Verifies gexfPath is empty. - * - * @param data Array[Row] of elements in format (crawl_date, src_domain, - * dest_domain, count) - * @param gexfPath output file - */ + * + * @param data Array[Row] of elements in format (crawl_date, src_domain, + * dest_domain, count) + * @param gexfPath output file + */ def apply(data: Array[Row], gexfPath: String): Boolean = { - if (gexfPath.isEmpty()) { + if (gexfPath.isEmpty()) { false } else { - makeFile (data, gexfPath) + makeFile(data, gexfPath) } } @@ -43,7 +44,8 @@ object WriteGEXF { * @return true on success. */ def makeFile(data: Array[Row], gexfPath: String): Boolean = { - val outFile = Files.newBufferedWriter(Paths.get(gexfPath), StandardCharsets.UTF_8) + val outFile = + Files.newBufferedWriter(Paths.get(gexfPath), StandardCharsets.UTF_8) val endAttribute = "\" />\n" val vertices = scala.collection.mutable.Set[String]() @@ -51,31 +53,41 @@ object WriteGEXF { vertices.add(d.get(1).asInstanceOf[String]) vertices.add(d.get(2).asInstanceOf[String]) } - outFile.write("\n" + - "\n" + - "\n" + - "\n" + - " \n" + - "\n" + - "\n") + outFile.write( + "\n" + + "\n" + + "\n" + + "\n" + + " \n" + + "\n" + + "\n" + ) vertices foreach { v => - outFile.write("\n\n") data foreach { e => - outFile.write("\n" + - "\n" + - "\n" + - "\n") + outFile.write( + "\n" + + "\n" + + "\n" + + "\n" + ) } outFile.write("\n\n") outFile.close() diff --git a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala index 8686ed65..a4cd2d22 100644 --- a/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala +++ b/src/main/scala/io/archivesunleashed/app/WriteGraphML.scala @@ -21,29 +21,31 @@ import java.nio.file.{Files, Paths} import org.apache.spark.sql.Row object WriteGraphML { + /** Verifies graphmlPath is empty. - * - * @param data Array[Row] elements in format (crawl_date, src_domain, - * dest_domain, count) - * @param graphmlPath output file - */ + * + * @param data Array[Row] elements in format (crawl_date, src_domain, + * dest_domain, count) + * @param graphmlPath output file + */ def apply(data: Array[Row], graphmlPath: String): Boolean = { if (graphmlPath.isEmpty()) { false } else { - makeFile (data, graphmlPath) + makeFile(data, graphmlPath) } } /** Produces the GraphML output from an Array[Row] and outputs it to graphmlPath. - * - * @param data a Dataset[Row] of elements in format (crawl_date, src_domain, - * dest_domain, count) - * @param graphmlPath output file - * @return true on success. - */ + * + * @param data a Dataset[Row] of elements in format (crawl_date, src_domain, + * dest_domain, count) + * @param graphmlPath output file + * @return true on success. 
+ */ def makeFile(data: Array[Row], graphmlPath: String): Boolean = { - val outFile = Files.newBufferedWriter(Paths.get(graphmlPath), StandardCharsets.UTF_8) + val outFile = + Files.newBufferedWriter(Paths.get(graphmlPath), StandardCharsets.UTF_8) val nodes = scala.collection.mutable.Set[String]() data foreach { d => @@ -51,32 +53,46 @@ object WriteGraphML { nodes.add(d.get(2).asInstanceOf[String]) } - outFile.write("\n" + - "" + - "\n" + - "\n" + - "0.0\n" + - "\n" + - "\n" + - "\n") + outFile.write( + "\n" + + "" + + "\n" + + "\n" + + "0.0\n" + + "\n" + + "\n" + + "\n" + ) nodes foreach { n => - outFile.write("\n" + - "" + n.asInstanceOf[String].escapeInvalidXML() + "\n\n") + outFile.write( + "\n" + + "" + n + .asInstanceOf[String] + .escapeInvalidXML() + "\n\n" + ) } data foreach { e => - outFile.write("\n" + - "" + e.get(3) + "\n" + - "" + e.get(0) + "\n" + - "\n") + outFile.write( + "\n" + + "" + e.get(3) + "\n" + + "" + e.get(0) + "\n" + + "\n" + ) } - outFile.write("\n" + - "") + outFile.write( + "\n" + + "" + ) outFile.close() true } diff --git a/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala b/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala index 53e44bb9..57cdaa15 100644 --- a/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala +++ b/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala @@ -20,73 +20,72 @@ import io.archivesunleashed.RecordLoader import org.apache.spark.SparkContext import org.apache.spark.sql.DataFrame -/** DataFrame wrapper for PySpark implementation. **/ +/** DataFrame wrapper for PySpark implementation. * */ class DataFrameLoader(sc: SparkContext) { /** Create a DataFrame with crawl_date, url, mime_type_web_server, mime_type_tika, content, bytes, http_status_code, and archive_filename. */ def all(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) + RecordLoader + .loadArchives(path, sc) .keepValidPages() .all() } /** Create a DataFrame with audio url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */ def audio(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) - .audio + RecordLoader.loadArchives(path, sc).audio } /* Create a DataFrame with crawl date, source page, image url, and alt text. */ def imagegraph(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) + RecordLoader + .loadArchives(path, sc) .imagegraph() } /** Create a DataFrame with image url, filename, extension, mime_type_web_server, mime_type_tika, width, height, md5, sha1, and raw bytes. */ def images(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) + RecordLoader + .loadArchives(path, sc) .images() } /** Create a DataFrame with PDF url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */ def pdfs(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) - .pdfs + RecordLoader.loadArchives(path, sc).pdfs } /** Create a DataFrame with presentation program file url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */ def presentationProgramFiles(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) - .presentationProgramFiles + RecordLoader.loadArchives(path, sc).presentationProgramFiles } /** Create a DataFrame with spreadsheet url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. 
*/ def spreadsheets(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) - .spreadsheets + RecordLoader.loadArchives(path, sc).spreadsheets } /** Create a DataFrame with video url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */ def videos(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) - .videos + RecordLoader.loadArchives(path, sc).videos } /** Create a DataFrame with crawl_date, source, destination, and anchor. */ def webgraph(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) + RecordLoader + .loadArchives(path, sc) .webgraph() } /** Create a DataFrame with crawl_date, url, mime_type_web_server, language, and content. */ def webpages(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) + RecordLoader + .loadArchives(path, sc) .webpages() } /** Create a DataFrame with word processor file url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */ def wordProcessorFiles(path: String): DataFrame = { - RecordLoader.loadArchives(path, sc) - .wordProcessorFiles + RecordLoader.loadArchives(path, sc).wordProcessorFiles } - } +} diff --git a/src/main/scala/io/archivesunleashed/df/package.scala b/src/main/scala/io/archivesunleashed/df/package.scala index 677711d8..727b442d 100644 --- a/src/main/scala/io/archivesunleashed/df/package.scala +++ b/src/main/scala/io/archivesunleashed/df/package.scala @@ -25,9 +25,9 @@ import org.apache.spark.sql.DataFrame package object df { /** - * Given a dataframe, serializes binary object and saves to disk - * @param df the input dataframe - */ + * Given a dataframe, serializes binary object and saves to disk + * @param df the input dataframe + */ implicit class SaveBytes(df: DataFrame) { /** @@ -36,24 +36,30 @@ package object df { * @param extensionColumnName the name of the column containing the extension * e.g. fileName = "foo" => files are saved as "foo-[MD5 hash].pdf" */ - def saveToDisk(bytesColumnName: String, fileName: String, extensionColumnName: String): Unit = { - df.select(bytesColumnName, extensionColumnName).foreach(row => { - try { - // Assumes the bytes are base64 encoded. - val encodedBytes: String = row.getAs(bytesColumnName); - val bytes = Base64.getDecoder.decode(encodedBytes); - val in = new ByteArrayInputStream(bytes); + def saveToDisk( + bytesColumnName: String, + fileName: String, + extensionColumnName: String + ): Unit = { + df.select(bytesColumnName, extensionColumnName) + .foreach(row => { + try { + // Assumes the bytes are base64 encoded. + val encodedBytes: String = row.getAs(bytesColumnName); + val bytes = Base64.getDecoder.decode(encodedBytes); + val in = new ByteArrayInputStream(bytes); - val extension: String = row.getAs(extensionColumnName); - val suffix = ComputeMD5(bytes) - val file = new FileOutputStream(fileName + "-" + suffix + "." + extension.toLowerCase) - IOUtils.copy(in, file) - file.close() - } catch { - case e: Throwable => { + val extension: String = row.getAs(extensionColumnName); + val suffix = ComputeMD5(bytes) + val file = new FileOutputStream( + fileName + "-" + suffix + "."
+ extension.toLowerCase + ) + IOUtils.copy(in, file) + file.close() + } catch { + case e: Throwable => {} } - } - }) + }) } } } diff --git a/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala index 15e2a94b..bbf8d8eb 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ComputeImageSize.scala @@ -22,13 +22,13 @@ import javax.imageio.ImageIO object ComputeImageSize { /** Computes image size from a byte array using ImageIO. - * - * Used by `ExtractPopularImages` to calculate the size of - * the image as a tuple of integers (width, height). - * - * @param bytes image as a byte array - * @return size of image as a tuple (width, height) or (0,0). - */ + * + * Used by `ExtractPopularImages` to calculate the size of + * the image as a tuple of integers (width, height). + * + * @param bytes image as a byte array + * @return size of image as a tuple (width, height) or (0,0). + */ def apply(bytes: Array[Byte]): (Int, Int) = { val nullImage = (0, 0) try { diff --git a/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala index ded60bb1..fda08e79 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ComputeMD5.scala @@ -19,12 +19,17 @@ import java.security.MessageDigest /** Compute MD5 checksum. */ object ComputeMD5 { + /** Computes the MD5 checksum of a byte array (eg. an image). * * @param bytes * @return MD5 checksum. */ def apply(bytes: Array[Byte]): String = { - MessageDigest.getInstance("MD5").digest(bytes).map("%02x".format(_)).mkString + MessageDigest + .getInstance("MD5") + .digest(bytes) + .map("%02x".format(_)) + .mkString } } diff --git a/src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala b/src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala index bf454971..2ec165ba 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ComputeSHA1.scala @@ -27,6 +27,10 @@ object ComputeSHA1 { * @return SHA1 checksum. */ def apply(bytes: Array[Byte]): String = { - MessageDigest.getInstance("SHA1").digest(bytes).map("%02x".format(_)).mkString + MessageDigest + .getInstance("SHA1") + .digest(bytes) + .map("%02x".format(_)) + .mkString } } diff --git a/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala index 3934513f..da4c14de 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/DetectLanguage.scala @@ -23,16 +23,16 @@ import org.apache.tika.language.detect.LanguageResult; object DetectLanguage { /** Detects the language of a String input. - * - * @param input the string for which language can be detected - * @return ISO 639-2 language code (eg. "en", "fr" or "it"). - */ + * + * @param input the string for which language can be detected + * @return ISO 639-2 language code (eg. "en", "fr" or "it"). 
+ */ def apply(input: String): String = { if (input.isEmpty) { "" } else { val detector: LanguageDetector = new OptimaizeLangDetector().loadModels() - val result : LanguageResult = detector.detect(input) + val result: LanguageResult = detector.detect(input) result.getLanguage() } } diff --git a/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala b/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala index aff45496..06c47365 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala @@ -33,10 +33,10 @@ object DetectMimeTypeTika { val allMimeTypes = MimeTypes.getDefaultMimeTypes(); /** Detect MIME type from an input string. - * - * @param content a byte array of content for which to detect the MimeType - * @return MIME type (e.g. "text/html" or "application/xml") or "N/A". - */ + * + * @param content a byte array of content for which to detect the MimeType + * @return MIME type (e.g. "text/html" or "application/xml") or "N/A". + */ def apply(content: Array[Byte]): String = { if (content.size == 0) { "N/A" @@ -49,20 +49,20 @@ object DetectMimeTypeTika { } /** Return the best guess at a file extension from a MIME type string - * - * @param mimeType string representation of the MimeType - * @return file extension (e.g. ".jpg" for "image/jpeg"). - */ + * + * @param mimeType string representation of the MimeType + * @return file extension (e.g. ".jpg" for "image/jpeg"). + */ def getExtension(mimeType: String): String = { val regMimeType = allMimeTypes.forName(mimeType) regMimeType.getExtension } /** Return the list of all known file extensions for a MIME type string - * - * @param mimeType string representation of the MimeType - * @return list of file extensions (e.g. ".jpg" for "image/jpeg"). - */ + * + * @param mimeType string representation of the MimeType + * @return list of file extensions (e.g. ".jpg" for "image/jpeg"). + */ def getExtensions(mimeType: String): List[String] = { val regMimeType = allMimeTypes.forName(mimeType) regMimeType.getExtensions.asScala.toList diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala index 2924a102..77b681d1 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala @@ -18,23 +18,28 @@ package io.archivesunleashed.matchbox import de.l3s.boilerpipe.extractors.DefaultExtractor import java.io.IOException -/** Extract raw text content from an HTML page, minus "boilerplate" content (using boilerpipe). */ +/** Extract raw text content from an HTML page, minus "boilerplate" content (using boilerpipe). */ object ExtractBoilerpipeText { + /** Uses boilerpipe to extract raw text content from a page. - * - * ExtractBoilerpipeText removes boilerplate text (e.g. a copyright statement) from an HTML string. - * - * @param input an html string possibly containing boilerpipe text - * @return text with boilerplate removed or Nil if the text is empty. - */ + * + * ExtractBoilerpipeText removes boilerplate text (e.g. a copyright statement) from an HTML string. + * + * @param input an html string possibly containing boilerpipe text + * @return text with boilerplate removed or Nil if the text is empty. 
+ */ def apply(input: String): String = { removeBoilerplate(RemoveHTTPHeader(input)) } private def removeBoilerplate(input: String): String = { - val maybeInput = Option(DefaultExtractor.INSTANCE - .getText(input).replaceAll("[\\r\\n]+", " ").trim()) + val maybeInput = Option( + DefaultExtractor.INSTANCE + .getText(input) + .replaceAll("[\\r\\n]+", " ") + .trim() + ) maybeInput match { case Some(text) => text diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala index ac697478..4dc44efb 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDate.scala @@ -18,6 +18,7 @@ package io.archivesunleashed.matchbox /** Gets different parts of a dateString. */ object ExtractDate { object DateComponent extends Enumeration { + /** An enum specifying years, months, days or a combination. */ type DateComponent = Value val YYYY, MM, DD, YYYYMM, YYYYMMDD = Value @@ -39,11 +40,11 @@ object ExtractDate { maybeFullDate match { case Some(fulldate) => dateFormat match { - case YYYY => fullDate.substring(startSS, yearSS) - case MM => fullDate.substring(yearSS, monthSS) - case DD => fullDate.substring(monthSS, daySS) + case YYYY => fullDate.substring(startSS, yearSS) + case MM => fullDate.substring(yearSS, monthSS) + case DD => fullDate.substring(monthSS, daySS) case YYYYMM => fullDate.substring(startSS, monthSS) - case _ => fullDate.substring(startSS, daySS) + case _ => fullDate.substring(startSS, daySS) } case None => "" @@ -64,11 +65,11 @@ object ExtractDate { maybeFullDate match { case Some(fulldate) => dateFormat match { - case "YYYY" => fullDate.substring(startSS, yearSS) - case "MM" => fullDate.substring(yearSS, monthSS) - case "DD" => fullDate.substring(monthSS, daySS) + case "YYYY" => fullDate.substring(startSS, yearSS) + case "MM" => fullDate.substring(yearSS, monthSS) + case "DD" => fullDate.substring(monthSS, daySS) case "YYYYMM" => fullDate.substring(startSS, monthSS) - case _ => fullDate.substring(startSS, daySS) + case _ => fullDate.substring(startSS, daySS) } case None => "" diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala index c0aef0cd..c609c837 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala @@ -19,12 +19,13 @@ import java.net.URL /** Extracts the host domain name from a full url string. */ object ExtractDomain { + /** Extract source domains from a full url string. - * - * @param url a url as a string - * @param source an optional default url for urls with no valid domain host - * @return domain host, source or null if url is null. - */ + * + * @param url a url as a string + * @param source an optional default url for urls with no valid domain host + * @return domain host, source or null if url is null. 
+ */ def apply(url: String, source: String = ""): String = { val maybeHost: Option[URL] = checkUrl(url) val maybeSource: Option[URL] = checkUrl(source) @@ -38,7 +39,7 @@ object ExtractDomain { source.getHost case None => "" - } + } } } diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala index b735c848..3bd05222 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractImageDetails.scala @@ -26,8 +26,12 @@ class ImageDetails(imageUrl: String, imageType: String, bytes: Array[Byte]) { val height = dimensions._2 val url: String = imageUrl val mimeType: String = imageType - val md5Hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))) - val sha1Hash: String = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))) + val md5Hash: String = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash: String = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) val body: String = Base64.getEncoder.encodeToString(bytes) } @@ -35,9 +39,9 @@ class ImageDetails(imageUrl: String, imageType: String, bytes: Array[Byte]) { object ExtractImageDetails { /** - * @param bytes the raw bytes of the image - * @return A tuple containing the width and height of the image - */ + * @param bytes the raw bytes of the image + * @return A tuple containing the width and height of the image + */ def apply(url: String, mimeType: String, bytes: Array[Byte]): ImageDetails = { new ImageDetails(url, mimeType, bytes) } diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala index e96be01a..015e088b 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala @@ -31,14 +31,18 @@ object ExtractLinks { * @param base an optional base URI * @return a sequence of (source, target, anchortext). */ - def apply(src: String, html: String, base: String = ""): Seq[(String, String, String)] = { + def apply( + src: String, + html: String, + base: String = "" + ): Seq[(String, String, String)] = { val srcMaybe: Option[String] = Option(src) val htmlMaybe: Option[String] = Option(html) val output = mutable.MutableList[(String, String, String)]() srcMaybe match { case Some(valid_src) => htmlMaybe match { - case Some (valid_html) => + case Some(valid_html) => val doc = Jsoup.parse(valid_html) val links: Elements = doc.select("a[href]") val it = links.iterator() @@ -51,11 +55,11 @@ object ExtractLinks { } } case None => - // do nothing - } + // do nothing + } case None => - // do nothing - } + // do nothing + } output } } diff --git a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala index 3903e17d..49f061d0 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala @@ -22,10 +22,10 @@ import org.jsoup.Jsoup object RemoveHTML { /** Removes HTML markup. - * - * @param content an html or text string - * @return content without html markup. - */ + * + * @param content an html or text string + * @return content without html markup. + */ def apply(content: String): String = { // First remove the HTTP header. 
val maybeContent: Option[String] = Option(RemoveHTTPHeader(content)) diff --git a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala index e074698d..80b54b7e 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTTPHeader.scala @@ -20,15 +20,15 @@ object RemoveHTTPHeader { val headerEnd = "\r\n\r\n" /** Remove HTTP headers. - * - * @param content string of WARC or ARC-based text content - * @return string with HTTP headers removed. - */ + * + * @param content string of WARC or ARC-based text content + * @return string with HTTP headers removed. + */ def apply(content: String): String = { val maybeContent: Option[String] = Option(content) maybeContent match { case Some(content) => - if (content.startsWith("HTTP/")){ + if (content.startsWith("HTTP/")) { content.substring(content.indexOf(headerEnd) + headerEnd.length) } else { content diff --git a/src/main/scala/io/archivesunleashed/matchbox/package.scala b/src/main/scala/io/archivesunleashed/matchbox/package.scala index ca0b8bf4..fe954517 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/package.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/package.scala @@ -20,7 +20,6 @@ import java.io.IOException import java.security.MessageDigest import scala.xml.Utility.escape - /** Package object which supplies implicits providing common UDF-related functionalities. */ package object matchbox { implicit class WWWLink(s: String) { @@ -28,16 +27,16 @@ package object matchbox { val maybeString: Option[String] = Option(s) maybeString match { case Some(s) => s.replaceAll("^\\s*www\\.", "") - case None => "" + case None => "" } } def escapeInvalidXML(): String = { try { escape(s) - } - catch { - case e: Exception => throw new IOException("Caught exception processing input row ", e) + } catch { + case e: Exception => + throw new IOException("Caught exception processing input row ", e) } } } diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index eadd14c3..3e6a4d8c 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -20,14 +20,32 @@ import java.security.MessageDigest import java.util.Base64 import io.archivesunleashed.data.ArchiveRecordWritable.ArchiveFormat -import io.archivesunleashed.data.{ArchiveRecordInputFormat, ArchiveRecordWritable} +import io.archivesunleashed.data.{ + ArchiveRecordInputFormat, + ArchiveRecordWritable +} import ArchiveRecordWritable.ArchiveFormat -import io.archivesunleashed.udfs.{detectLanguage, detectMimeTypeTika, extractDate, extractDomain, removeHTML} +import io.archivesunleashed.udfs.{ + detectLanguage, + detectMimeTypeTika, + extractDate, + extractDomain, + removeHTML +} -import io.archivesunleashed.matchbox.{DetectLanguage, DetectMimeTypeTika, ExtractDate, - ExtractDomain, ExtractImageDetails, ExtractImageLinks, - ExtractLinks, GetExtensionMIME, RemoveHTML, RemoveHTTPHeader} +import io.archivesunleashed.matchbox.{ + DetectLanguage, + DetectMimeTypeTika, + ExtractDate, + ExtractDomain, + ExtractImageDetails, + ExtractImageLinks, + ExtractLinks, + GetExtensionMIME, + RemoveHTML, + RemoveHTTPHeader +} import io.archivesunleashed.matchbox.ExtractDate.DateComponent import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent import java.net.URI @@ -38,7 +56,13 @@ import 
org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.LongWritable import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.{lit, udf} -import org.apache.spark.sql.types.{BinaryType, IntegerType, StringType, StructField, StructType} +import org.apache.spark.sql.types.{ + BinaryType, + IntegerType, + StringType, + StructField, + StructType +} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.{RangePartitioner, SerializableWritable, SparkContext} import scala.reflect.ClassTag @@ -49,8 +73,10 @@ import scala.util.Try * Package object which supplies implicits to augment generic RDDs with AUT-specific transformations. */ package object archivesunleashed { + /** Loads records from either WARCs or ARCs. */ object RecordLoader { + /** Gets all non-empty archive files. * * @param dir the path to the directory containing archive files @@ -59,7 +85,9 @@ package object archivesunleashed { */ def getFiles(dir: Path, fs: FileSystem): String = { val statuses = fs.globStatus(dir) - val files = statuses.filter(f => fs.getContentSummary(f.getPath).getLength > 0).map(f => f.getPath) + val files = statuses + .filter(f => fs.getContentSummary(f.getPath).getLength > 0) + .map(f => f.getPath) files.mkString(",") } @@ -73,17 +101,26 @@ package object archivesunleashed { val uri = new URI(path) val fs = FileSystem.get(uri, sc.hadoopConfiguration) val p = new Path(path) - sc.newAPIHadoopFile(getFiles(p, fs), classOf[ArchiveRecordInputFormat], classOf[LongWritable], classOf[ArchiveRecordWritable]) - .filter(r => (r._2.getFormat == ArchiveFormat.ARC) || - ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader.getHeaderValue("WARC-Type").equals("response"))) - .map(r => new ArchiveRecordImpl(new SerializableWritable(r._2))) + sc.newAPIHadoopFile( + getFiles(p, fs), + classOf[ArchiveRecordInputFormat], + classOf[LongWritable], + classOf[ArchiveRecordWritable] + ).filter(r => + (r._2.getFormat == ArchiveFormat.ARC) || + ((r._2.getFormat == ArchiveFormat.WARC) && r._2.getRecord.getHeader + .getHeaderValue("WARC-Type") + .equals("response")) + ).map(r => new ArchiveRecordImpl(new SerializableWritable(r._2))) } } /** A Wrapper class around RDD to simplify counting. */ - implicit class CountableRDD[T: ClassTag](rdd: RDD[T]) extends java.io.Serializable { + implicit class CountableRDD[T: ClassTag](rdd: RDD[T]) + extends java.io.Serializable { def countItems(): RDD[(T, Int)] = { - rdd.map(r => (r, 1)) + rdd + .map(r => (r, 1)) .reduceByKey((c1, c2) => c1 + c2) .sortBy(f => f._2, ascending = false) } @@ -104,13 +141,13 @@ package object archivesunleashed { /** Removes all non-html-based data (images, executables, etc.) from html text. */ def keepValidPagesDF(): DataFrame = { df.filter($"crawl_date" isNotNull) - .filter(!($"url".rlike(".*robots\\.txt$")) && - ( $"mime_type_web_server".rlike("text/html") || - $"mime_type_web_server".rlike("application/xhtml+xml") || - $"url".rlike("(?i).*htm$") || - $"url".rlike("(?i).*html$") - ) - ) + .filter( + !($"url".rlike(".*robots\\.txt$")) && + ($"mime_type_web_server".rlike("text/html") || + $"mime_type_web_server".rlike("application/xhtml+xml") || + $"url".rlike("(?i).*htm$") || + $"url".rlike("(?i).*html$")) + ) .filter($"http_status_code" === 200) } } @@ -120,14 +157,24 @@ package object archivesunleashed { * * To load such an RDD, please see [[RecordLoader]]. 
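* A minimal sketch of the load-and-transform flow (the glob path is hypothetical):
* {{{
* val rdd = RecordLoader.loadArchives("/data/warcs/*.warc.gz", sc)
* val pages = rdd.webpages() // crawl_date, url, MIME types, language, content
* }}}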
*/ - implicit class WARecordRDD(rdd: RDD[ArchiveRecord]) extends java.io.Serializable { + implicit class WARecordRDD(rdd: RDD[ArchiveRecord]) + extends java.io.Serializable { /* Creates a column for Bytes as well in Dataframe. Call KeepImages OR KeepValidPages on RDD depending upon the requirement before calling this method */ def all(): DataFrame = { - val records = rdd.map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType, - DetectMimeTypeTika(r.getBinaryBytes), r.getContentString, - r.getBinaryBytes, r.getHttpStatus, r.getArchiveFilename)) + val records = rdd.map(r => + Row( + r.getCrawlDate, + r.getUrl, + r.getMimeType, + DetectMimeTypeTika(r.getBinaryBytes), + r.getContentString, + r.getBinaryBytes, + r.getHttpStatus, + r.getArchiveFilename + ) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) @@ -148,20 +195,28 @@ package object archivesunleashed { rdd.filter(r => r.getCrawlDate != null && (r.getMimeType == "text/html" - || r.getMimeType == "application/xhtml+xml" - || r.getUrl.toLowerCase.endsWith("htm") - || r.getUrl.toLowerCase.endsWith("html")) + || r.getMimeType == "application/xhtml+xml" + || r.getUrl.toLowerCase.endsWith("htm") + || r.getUrl.toLowerCase.endsWith("html")) && !r.getUrl.toLowerCase.endsWith("robots.txt") - && r.getHttpStatus == "200") + && r.getHttpStatus == "200" + ) } /** Extracts webpages with columns for crawl data, url, MIME type, and content. */ def webpages(): DataFrame = { - val records = rdd.keepValidPages() - .map(r => Row(r.getCrawlDate, r.getUrl, r.getMimeType, - DetectMimeTypeTika(r.getBinaryBytes), - DetectLanguage(RemoveHTML(RemoveHTTPHeader(r.getContentString))), - r.getContentString)) + val records = rdd + .keepValidPages() + .map(r => + Row( + r.getCrawlDate, + r.getUrl, + r.getMimeType, + DetectMimeTypeTika(r.getBinaryBytes), + DetectLanguage(RemoveHTML(RemoveHTTPHeader(r.getContentString))), + r.getContentString + ) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) @@ -179,8 +234,10 @@ package object archivesunleashed { def webgraph(): DataFrame = { val records = rdd .keepValidPages() - .flatMap(r => ExtractLinks(r.getUrl, r.getContentString) - .map(t => (r.getCrawlDate, t._1, t._2, t._3))) + .flatMap(r => + ExtractLinks(r.getUrl, r.getContentString) + .map(t => (r.getCrawlDate, t._1, t._2, t._3)) + ) .filter(t => t._2 != "" && t._3 != "") .map(t => Row(t._1, t._2, t._3, t._4)) @@ -198,8 +255,10 @@ package object archivesunleashed { def imagegraph(): DataFrame = { val records = rdd .keepValidPages() - .flatMap(r => ExtractImageLinks(r.getUrl, r.getContentString) - .map(t => (r.getCrawlDate, t._1, t._2, t._3))) + .flatMap(r => + ExtractImageLinks(r.getUrl, r.getContentString) + .map(t => (r.getCrawlDate, t._1, t._2, t._3)) + ) .filter(t => t._2 != "" && t._3 != "") .map(t => Row(t._1, t._2, t._3, t._4)) @@ -219,14 +278,40 @@ package object archivesunleashed { .keepImages() .map(r => { val mimeTypeTika = DetectMimeTypeTika(r.getBinaryBytes) - val image = ExtractImageDetails(r.getUrl, mimeTypeTika, r.getBinaryBytes) + val image = + ExtractImageDetails(r.getUrl, mimeTypeTika, r.getBinaryBytes) val url = new URL(r.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMIME(url.getPath(), mimeTypeTika) - (r.getCrawlDate, r.getUrl, filename, extension, r.getMimeType, mimeTypeTika, - image.width, image.height, image.md5Hash, image.sha1Hash, image.body) + ( + r.getCrawlDate, + r.getUrl, + filename, + extension, + r.getMimeType, + mimeTypeTika, + 
image.width, + image.height, + image.md5Hash, + image.sha1Hash, + image.body + ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10, t._11)) + .map(t => + Row( + t._1, + t._2, + t._3, + t._4, + t._5, + t._6, + t._7, + t._8, + t._9, + t._10, + t._11 + ) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) @@ -248,20 +333,31 @@ package object archivesunleashed { /* Extract PDF bytes and PDF metadata. */ def pdfs(): DataFrame = { val records = rdd - .map(r => - (r, (DetectMimeTypeTika(r.getBinaryBytes))) - ) + .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes)))) .filter(r => r._2 == "application/pdf") .map(r => { val bytes = r._1.getBinaryBytes - val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))) - val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))) + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) val encodedBytes = Base64.getEncoder.encodeToString(bytes) val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMIME(url.getPath(), r._2) - (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, - DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + encodedBytes + ) }) .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) @@ -283,20 +379,31 @@ package object archivesunleashed { /* Extract audio bytes and audio metadata. */ def audio(): DataFrame = { val records = rdd - .map(r => - (r, (DetectMimeTypeTika(r.getBinaryBytes))) - ) + .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes)))) .filter(r => r._2.startsWith("audio/")) .map(r => { val bytes = r._1.getBinaryBytes - val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))) - val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))) + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) val encodedBytes = Base64.getEncoder.encodeToString(bytes) val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMIME(url.getPath(), r._2) - (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, - DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + encodedBytes + ) }) .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) @@ -318,20 +425,31 @@ package object archivesunleashed { /* Extract video bytes and video metadata. 
*/ def videos(): DataFrame = { val records = rdd - .map(r => - (r, (DetectMimeTypeTika(r.getBinaryBytes))) - ) + .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes)))) .filter(r => r._2.startsWith("video/")) .map(r => { val bytes = r._1.getBinaryBytes - val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))) - val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))) + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) val encodedBytes = Base64.getEncoder.encodeToString(bytes) val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMIME(url.getPath(), r._2) - (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, - DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + encodedBytes + ) }) .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) @@ -353,39 +471,43 @@ package object archivesunleashed { /* Extract spreadsheet bytes and spreadsheet metadata. */ def spreadsheets(): DataFrame = { val records = rdd - .map(r => - (r, (DetectMimeTypeTika(r.getBinaryBytes))) - ) - .filter(r => (r._2 == "application/vnd.ms-excel" - || r._2 == "application/vnd.ms-excel.workspace.3" - || r._2 == "application/vnd.ms-excel.workspace.4" - || r._2 == "application/vnd.ms-excel.sheet.2" - || r._2 == "application/vnd.ms-excel.sheet.3" - || r._2 == "application/vnd.ms-excel.sheet.3" - || r._2 == "application/vnd.ms-excel.addin.macroenabled.12" - || r._2 == "application/vnd.ms-excel.sheet.binary.macroenabled.12" - || r._2 == "application/vnd.ms-excel.sheet.macroenabled.12" - || r._2 == "application/vnd.ms-excel.template.macroenabled.12" - || r._2 == "application/vnd.ms-spreadsheetml" - || r._2 == "application/vnd.openxmlformats-officedocument.spreadsheetml.template" - || r._2 == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" - || r._2 == "application/x-vnd.oasis.opendocument.spreadsheet-template" - || r._2 == "application/vnd.oasis.opendocument.spreadsheet-template" - || r._2 == "application/vnd.oasis.opendocument.spreadsheet" - || r._2 == "application/x-vnd.oasis.opendocument.spreadsheet" - || r._2 == "application/x-tika-msworks-spreadsheet" - || r._2 == "application/vnd.lotus-1-2-3" - || r._2 == "text/csv" // future versions of Tika? 
- || r._2 == "text/tab-separated-values" // " " - || r._1.getMimeType == "text/csv" - || r._1.getMimeType == "text/tab-separated-values") - || ((r._1.getUrl.toLowerCase.endsWith(".csv") - || r._1.getUrl.toLowerCase.endsWith(".tsv")) - && r._2 == "text/plain")) + .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes)))) + .filter(r => + (r._2 == "application/vnd.ms-excel" + || r._2 == "application/vnd.ms-excel.workspace.3" + || r._2 == "application/vnd.ms-excel.workspace.4" + || r._2 == "application/vnd.ms-excel.sheet.2" + || r._2 == "application/vnd.ms-excel.sheet.3" + || r._2 == "application/vnd.ms-excel.sheet.3" + || r._2 == "application/vnd.ms-excel.addin.macroenabled.12" + || r._2 == "application/vnd.ms-excel.sheet.binary.macroenabled.12" + || r._2 == "application/vnd.ms-excel.sheet.macroenabled.12" + || r._2 == "application/vnd.ms-excel.template.macroenabled.12" + || r._2 == "application/vnd.ms-spreadsheetml" + || r._2 == "application/vnd.openxmlformats-officedocument.spreadsheetml.template" + || r._2 == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + || r._2 == "application/x-vnd.oasis.opendocument.spreadsheet-template" + || r._2 == "application/vnd.oasis.opendocument.spreadsheet-template" + || r._2 == "application/vnd.oasis.opendocument.spreadsheet" + || r._2 == "application/x-vnd.oasis.opendocument.spreadsheet" + || r._2 == "application/x-tika-msworks-spreadsheet" + || r._2 == "application/vnd.lotus-1-2-3" + || r._2 == "text/csv" // future versions of Tika? + || r._2 == "text/tab-separated-values" // " " + || r._1.getMimeType == "text/csv" + || r._1.getMimeType == "text/tab-separated-values") + || ((r._1.getUrl.toLowerCase.endsWith(".csv") + || r._1.getUrl.toLowerCase.endsWith(".tsv")) + && r._2 == "text/plain") + ) .map(r => { val bytes = r._1.getBinaryBytes - val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))) - val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))) + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) val encodedBytes = Base64.getEncoder.encodeToString(bytes) val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) @@ -398,8 +520,17 @@ package object archivesunleashed { } } val extension = GetExtensionMIME(url.getPath(), mimeType) - (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, - DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + encodedBytes + ) }) .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) @@ -421,32 +552,45 @@ package object archivesunleashed { /* Extract presentation program bytes and presentation program metadata. 
*/ def presentationProgramFiles(): DataFrame = { val records = rdd - .map(r => - (r, (DetectMimeTypeTika(r.getBinaryBytes))) - ) - .filter(r => r._2 == "application/vnd.ms-powerpoint" - || r._2 == "application/vnd.openxmlformats-officedocument.presentationml.presentation" - || r._2 == "application/vnd.oasis.opendocument.presentation" - || r._2 == "application/vnd.oasis.opendocument.presentation-template" - || r._2 == "application/vnd.sun.xml.impress" - || r._2 == "application/vnd.sun.xml.impress.template" - || r._2 == "application/vnd.stardivision.impress" - || r._2 == "application/x-starimpress" - || r._2 == "application/vnd.ms-powerpoint.addin.macroEnabled.12" - || r._2 == "application/vnd.ms-powerpoint.presentation.macroEnabled.12" - || r._2 == "application/vnd.ms-powerpoint.slide.macroEnabled.12" - || r._2 == "application/vnd.ms-powerpoint.slideshow.macroEnabled.12" - || r._2 == "application/vnd.ms-powerpoint.template.macroEnabled.12") + .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes)))) + .filter(r => + r._2 == "application/vnd.ms-powerpoint" + || r._2 == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + || r._2 == "application/vnd.oasis.opendocument.presentation" + || r._2 == "application/vnd.oasis.opendocument.presentation-template" + || r._2 == "application/vnd.sun.xml.impress" + || r._2 == "application/vnd.sun.xml.impress.template" + || r._2 == "application/vnd.stardivision.impress" + || r._2 == "application/x-starimpress" + || r._2 == "application/vnd.ms-powerpoint.addin.macroEnabled.12" + || r._2 == "application/vnd.ms-powerpoint.presentation.macroEnabled.12" + || r._2 == "application/vnd.ms-powerpoint.slide.macroEnabled.12" + || r._2 == "application/vnd.ms-powerpoint.slideshow.macroEnabled.12" + || r._2 == "application/vnd.ms-powerpoint.template.macroEnabled.12" + ) .map(r => { val bytes = r._1.getBinaryBytes - val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))) - val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))) + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) val encodedBytes = Base64.getEncoder.encodeToString(bytes) val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMIME(url.getPath(), r._2) - (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, - DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + encodedBytes + ) }) .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) @@ -468,37 +612,50 @@ package object archivesunleashed { /* Extract word processor bytes and word processor metadata. 
*/ def wordProcessorFiles(): DataFrame = { val records = rdd - .map(r => - (r, (DetectMimeTypeTika(r.getBinaryBytes))) - ) - .filter(r => r._2 == "application/vnd.lotus-wordpro" - || r._2 == "application/vnd.kde.kword" - || r._2 == "application/vnd.ms-word.document.macroEnabled.12" - || r._2 == "application/vnd.ms-word.template.macroEnabled.12" - || r._2 == "application/vnd.oasis.opendocument.text" - || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml" - || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" - || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml" - || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml" - || r._2 == "application/vnd.wordperfect" - || r._2 == "application/wordperfect5.1" - || r._2 == "application/msword" - || r._2 == "application/vnd.ms-word.document.macroEnabled.12" - || r._2 == "application/vnd.ms-word.template.macroEnabled.12" - || r._2 == "application/vnd.apple.pages" - || r._2 == "application/macwriteii" - || r._2 == "application/vnd.ms-works" - || r._2 == "application/rtf") + .map(r => (r, (DetectMimeTypeTika(r.getBinaryBytes)))) + .filter(r => + r._2 == "application/vnd.lotus-wordpro" + || r._2 == "application/vnd.kde.kword" + || r._2 == "application/vnd.ms-word.document.macroEnabled.12" + || r._2 == "application/vnd.ms-word.template.macroEnabled.12" + || r._2 == "application/vnd.oasis.opendocument.text" + || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml" + || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.glossary+xml" + || r._2 == "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml" + || r._2 == "application/vnd.wordperfect" + || r._2 == "application/wordperfect5.1" + || r._2 == "application/msword" + || r._2 == "application/vnd.ms-word.document.macroEnabled.12" + || r._2 == "application/vnd.ms-word.template.macroEnabled.12" + || r._2 == "application/vnd.apple.pages" + || r._2 == "application/macwriteii" + || r._2 == "application/vnd.ms-works" + || r._2 == "application/rtf" + ) .map(r => { val bytes = r._1.getBinaryBytes - val md5Hash = new String(Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes))) - val sha1Hash = new String(Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes))) + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) val encodedBytes = Base64.getEncoder.encodeToString(bytes) val url = new URL(r._1.getUrl) val filename = FilenameUtils.getName(url.getPath()) val extension = GetExtensionMIME(url.getPath(), r._2) - (r._1.getCrawlDate, r._1.getUrl, filename, extension, r._1.getMimeType, - DetectMimeTypeTika(r._1.getBinaryBytes), md5Hash, sha1Hash, encodedBytes) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + encodedBytes + ) }) .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) @@ -519,8 +676,10 @@ package object archivesunleashed { /** Removes all data except images. 
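* A record is kept only when its crawl date is non-null and Tika detects an
* image/* MIME type. A minimal sketch (the path is hypothetical):
* {{{
* val images = RecordLoader.loadArchives("/data/warcs/*.warc.gz", sc).keepImages()
* }}}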
*/ def keepImages(): RDD[ArchiveRecord] = { - rdd.filter(r => r.getCrawlDate != null - && DetectMimeTypeTika(r.getBinaryBytes).startsWith("image/")) + rdd.filter(r => + r.getCrawlDate != null + && DetectMimeTypeTika(r.getBinaryBytes).startsWith("image/") + ) } /** Removes all data but selected mimeTypes specified. @@ -540,9 +699,9 @@ package object archivesunleashed { } /** Removes all data that does not have selected HTTP status codes. - * - * @param statusCodes a list of HTTP status codes - */ + * + * @param statusCodes a list of HTTP status codes + */ def keepHttpStatus(statusCodes: Set[String]): RDD[ArchiveRecord] = { rdd.filter(r => statusCodes.contains(r.getHttpStatus)) } @@ -552,7 +711,10 @@ package object archivesunleashed { * @param dates a list of dates * @param component the selected DateComponent enum value */ - def keepDate(dates: List[String], component: DateComponent = DateComponent.YYYYMMDD): RDD[ArchiveRecord] = { + def keepDate( + dates: List[String], + component: DateComponent = DateComponent.YYYYMMDD + ): RDD[ArchiveRecord] = { rdd.filter(r => dates.contains(ExtractDate(r.getCrawlDate, component))) } @@ -570,11 +732,15 @@ package object archivesunleashed { */ def keepUrlPatterns(urlREs: Set[Regex]): RDD[ArchiveRecord] = { rdd.filter(r => - urlREs.map(re => - r.getUrl match { - case re() => true - case _ => false - }).exists(identity)) + urlREs + .map(re => + r.getUrl match { + case re() => true + case _ => false + } + ) + .exists(identity) + ) } /** Removes all data but selected source domains. @@ -582,7 +748,9 @@ package object archivesunleashed { * @param urls a list of urls for the source domains */ def keepDomains(urls: Set[String]): RDD[ArchiveRecord] = { - rdd.filter(r => urls.contains(ExtractDomain(r.getUrl).replace("^\\s*www\\.", ""))) + rdd.filter(r => + urls.contains(ExtractDomain(r.getUrl).replace("^\\s*www\\.", "")) + ) } /** Removes all data not in selected language. @@ -590,7 +758,9 @@ package object archivesunleashed { * @param lang a set of ISO 639-2 codes */ def keepLanguages(lang: Set[String]): RDD[ArchiveRecord] = { - rdd.filter(r => lang.contains(DetectLanguage(RemoveHTML(r.getContentString)))) + rdd.filter(r => + lang.contains(DetectLanguage(RemoveHTML(r.getContentString))) + ) } /** Removes all content that does not pass Regular Expression test. @@ -599,11 +769,15 @@ package object archivesunleashed { */ def keepContent(contentREs: Set[Regex]): RDD[ArchiveRecord] = { rdd.filter(r => - contentREs.map(re => - (re findFirstIn r.getContentString) match { - case Some(v) => true - case None => false - }).exists(identity)) + contentREs + .map(re => + (re findFirstIn r.getContentString) match { + case Some(v) => true + case None => false + } + ) + .exists(identity) + ) } /** Filters ArchiveRecord MimeTypes (web server). @@ -647,16 +821,20 @@ package object archivesunleashed { } /** Filters detected URL patterns (regex). - * - * @param urlREs a list of Regular expressions - */ + * + * @param urlREs a list of Regular expressions + */ def discardUrlPatterns(urlREs: Set[Regex]): RDD[ArchiveRecord] = { rdd.filter(r => - !urlREs.map(re => - r.getUrl match { - case re() => true - case _ => false - }).exists(identity)) + !urlREs + .map(re => + r.getUrl match { + case re() => true + case _ => false + } + ) + .exists(identity) + ) } /** Filters detected domains (regex). 
@@ -673,11 +851,15 @@ package object archivesunleashed { */ def discardContent(contentREs: Set[Regex]): RDD[ArchiveRecord] = { rdd.filter(r => - !contentREs.map(re => - (re findFirstIn r.getContentString) match { - case Some(v) => true - case None => false - }).exists(identity)) + !contentREs + .map(re => + (re findFirstIn r.getContentString) match { + case Some(v) => true + case None => false + } + ) + .exists(identity) + ) } /** Filters detected language. @@ -685,7 +867,9 @@ package object archivesunleashed { * @param lang a set of ISO 639-2 codes */ def discardLanguages(lang: Set[String]): RDD[ArchiveRecord] = { - rdd.filter(r => !lang.contains(DetectLanguage(RemoveHTML(r.getContentString)))) + rdd.filter(r => + !lang.contains(DetectLanguage(RemoveHTML(r.getContentString))) + ) } } } diff --git a/src/main/scala/io/archivesunleashed/udfs/package.scala b/src/main/scala/io/archivesunleashed/udfs/package.scala index db6ddef8..30e6000e 100644 --- a/src/main/scala/io/archivesunleashed/udfs/package.scala +++ b/src/main/scala/io/archivesunleashed/udfs/package.scala @@ -16,57 +16,103 @@ package io.archivesunleashed -import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5, ComputeSHA1, - DetectLanguage, DetectMimeTypeTika, - ExtractBoilerpipeText, ExtractDate, - ExtractDomain, ExtractImageLinks, ExtractLinks, - GetExtensionMIME, RemoveHTML, RemoveHTTPHeader} +import io.archivesunleashed.matchbox.{ + ComputeImageSize, + ComputeMD5, + ComputeSHA1, + DetectLanguage, + DetectMimeTypeTika, + ExtractBoilerpipeText, + ExtractDate, + ExtractDomain, + ExtractImageLinks, + ExtractLinks, + GetExtensionMIME, + RemoveHTML, + RemoveHTTPHeader +} import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.udf import org.apache.spark.sql.SparkSession import scala.util.matching.Regex -/** Package object providing UDFs for DataFrames in Scala and PySpark. **/ +/** Package object providing UDFs for DataFrames in Scala and PySpark. 
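* A sketch of typical usage, mirroring the test suite below (assumes the
* DataFrame columns produced by RecordLoader and spark.implicits in scope):
* {{{
* df.filter(hasHTTPStatus($"http_status_code", lit(Array("200"))))
* }}}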
* */ package object udfs extends Serializable { // Matchbox - def computeImageSize: UserDefinedFunction = udf(ComputeImageSize.apply(_: Array[Byte])) + def computeImageSize: UserDefinedFunction = + udf(ComputeImageSize.apply(_: Array[Byte])) def computeMD5: UserDefinedFunction = udf(ComputeMD5.apply(_: Array[Byte])) def computeSHA1: UserDefinedFunction = udf(ComputeSHA1.apply(_: Array[Byte])) def detectLanguage: UserDefinedFunction = udf(DetectLanguage.apply(_: String)) - def detectMimeTypeTika: UserDefinedFunction = udf(DetectMimeTypeTika.apply(_: Array[Byte])) - def extractBoilerpipeText: UserDefinedFunction = udf(ExtractBoilerpipeText.apply(_: String)) - def extractDate: UserDefinedFunction = udf(ExtractDate.apply(_: String, _: String)) - def extractDomain: UserDefinedFunction = udf(ExtractDomain.apply(_: String, "")) - def extractImageLinks: UserDefinedFunction = udf(ExtractImageLinks.apply(_: String, _: String)) - def extractLinks: UserDefinedFunction = udf(ExtractLinks.apply(_: String, _: String)) - def getExtensionMime: UserDefinedFunction = udf(GetExtensionMIME.apply(_: String, _: String)) + def detectMimeTypeTika: UserDefinedFunction = + udf(DetectMimeTypeTika.apply(_: Array[Byte])) + def extractBoilerpipeText: UserDefinedFunction = + udf(ExtractBoilerpipeText.apply(_: String)) + def extractDate: UserDefinedFunction = + udf(ExtractDate.apply(_: String, _: String)) + def extractDomain: UserDefinedFunction = + udf(ExtractDomain.apply(_: String, "")) + def extractImageLinks: UserDefinedFunction = + udf(ExtractImageLinks.apply(_: String, _: String)) + def extractLinks: UserDefinedFunction = + udf(ExtractLinks.apply(_: String, _: String)) + def getExtensionMime: UserDefinedFunction = + udf(GetExtensionMIME.apply(_: String, _: String)) def removeHTML: UserDefinedFunction = udf(RemoveHTML.apply(_: String)) - def removeHTTPHeader: UserDefinedFunction = udf(RemoveHTTPHeader.apply(_: String)) - def removePrefixWWW: UserDefinedFunction = udf[String, String](_.replaceAll("^\\s*www\\.", "")) + def removeHTTPHeader: UserDefinedFunction = + udf(RemoveHTTPHeader.apply(_: String)) + def removePrefixWWW: UserDefinedFunction = + udf[String, String](_.replaceAll("^\\s*www\\.", "")) // Filters - def hasContent: UserDefinedFunction = udf((c: String, contentREs: Seq[String]) => { - contentREs.map(re => - (re.r findFirstIn c) match { - case Some(v) => true - case None => false - }).exists(identity) - }) - def hasDate: UserDefinedFunction = udf((date_ : String, date: Seq[String]) => date.contains(date_)) - def hasDomains: UserDefinedFunction = udf((domain: String, domains: Seq[String]) => domains.contains(domain)) - def hasHTTPStatus: UserDefinedFunction = udf((statusCode: String, statusCodes: Seq[String]) => statusCodes.contains(statusCode)) - def hasImages: UserDefinedFunction = udf((date: String, mimeType: String) => date != null && mimeType.startsWith("image/")) - def hasLanguages: UserDefinedFunction = udf((language: String, languages: Seq[String]) => languages.contains(language)) - def hasMIMETypes: UserDefinedFunction = udf((mimeType: String, mimeTypes: Seq[String]) => mimeTypes.contains(mimeType)) - def hasMIMETypesTika: UserDefinedFunction = udf((mimeType: String, mimeTypesTika: Seq[String]) => mimeTypesTika.contains(mimeType)) - def hasUrlPatterns: UserDefinedFunction = udf((urlPattern: String, urlREs: Seq[String]) => { - urlREs.map(re => - urlPattern match { - case re.r() => true - case _ => false - }).exists(identity) - }) - def hasUrls: UserDefinedFunction = udf((url: String, urls: 
Seq[String]) => urls.contains(url)) + def hasContent: UserDefinedFunction = + udf((c: String, contentREs: Seq[String]) => { + contentREs + .map(re => + (re.r findFirstIn c) match { + case Some(v) => true + case None => false + } + ) + .exists(identity) + }) + def hasDate: UserDefinedFunction = + udf((date_ : String, date: Seq[String]) => date.contains(date_)) + def hasDomains: UserDefinedFunction = + udf((domain: String, domains: Seq[String]) => domains.contains(domain)) + def hasHTTPStatus: UserDefinedFunction = + udf((statusCode: String, statusCodes: Seq[String]) => + statusCodes.contains(statusCode) + ) + def hasImages: UserDefinedFunction = + udf((date: String, mimeType: String) => + date != null && mimeType.startsWith("image/") + ) + def hasLanguages: UserDefinedFunction = + udf((language: String, languages: Seq[String]) => + languages.contains(language) + ) + def hasMIMETypes: UserDefinedFunction = + udf((mimeType: String, mimeTypes: Seq[String]) => + mimeTypes.contains(mimeType) + ) + def hasMIMETypesTika: UserDefinedFunction = + udf((mimeType: String, mimeTypesTika: Seq[String]) => + mimeTypesTika.contains(mimeType) + ) + def hasUrlPatterns: UserDefinedFunction = + udf((urlPattern: String, urlREs: Seq[String]) => { + urlREs + .map(re => + urlPattern match { + case re.r() => true + case _ => false + } + ) + .exists(identity) + }) + def hasUrls: UserDefinedFunction = + udf((url: String, urls: Seq[String]) => urls.contains(url)) } diff --git a/src/test/scala/io/archivesunleashed/ArcTest.scala b/src/test/scala/io/archivesunleashed/ArcTest.scala index ec3852d2..a879338c 100644 --- a/src/test/scala/io/archivesunleashed/ArcTest.scala +++ b/src/test/scala/io/archivesunleashed/ArcTest.scala @@ -18,7 +18,13 @@ package io.archivesunleashed import com.google.common.io.Resources import io.archivesunleashed.matchbox.ExtractDate.DateComponent -import io.archivesunleashed.matchbox.{DetectLanguage, DetectMimeTypeTika, ExtractLinks, RemoveHTML, RemoveHTTPHeader} +import io.archivesunleashed.matchbox.{ + DetectLanguage, + DetectMimeTypeTika, + ExtractLinks, + RemoveHTML, + RemoveHTTPHeader +} import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @@ -48,38 +54,46 @@ class ArcTest extends FunSuite with BeforeAndAfter { test("Filter date RDD") { val startSS = 0 val monthSS = 6 - val four = RecordLoader.loadArchives(arcPath, sc) + val four = RecordLoader + .loadArchives(arcPath, sc) .keepDate(List("200804", dayMonthTestA), DateComponent.YYYYMM) .map(r => r.getCrawlDate) .collect() - val five = RecordLoader.loadArchives(arcPath, sc) - .keepDate(List(dayMonthTestA,"200807"), DateComponent.YYYYMM) + val five = RecordLoader + .loadArchives(arcPath, sc) + .keepDate(List(dayMonthTestA, "200807"), DateComponent.YYYYMM) .map(r => r.getCrawlDate) .collect() four.foreach(date => assert(date.substring(startSS, monthSS) == "200804")) - five.foreach(date => assert(date.substring(startSS, monthSS) == dayMonthTestA)) + five.foreach(date => + assert(date.substring(startSS, monthSS) == dayMonthTestA) + ) } test("Filter URL pattern RDD") { - val keepMatches = RecordLoader.loadArchives(arcPath, sc) + val keepMatches = RecordLoader + .loadArchives(arcPath, sc) .keepUrlPatterns(Set("http://www.archive.org/about/.*".r)) - val discardMatches = RecordLoader.loadArchives(arcPath, sc) - .discardUrlPatterns(Set("http://www.archive.org/about/.*".r)) + val discardMatches = RecordLoader + .loadArchives(arcPath, sc) + 
.discardUrlPatterns(Set("http://www.archive.org/about/.*".r)) assert(keepMatches.count == 16L) assert(discardMatches.count == 284L) } test("Count links RDD") { - val links = RecordLoader.loadArchives(arcPath, sc) + val links = RecordLoader + .loadArchives(arcPath, sc) .map(r => ExtractLinks(r.getUrl, r.getContentString)) .reduce((a, b) => a ++ b) assert(links.size == 664) } test("Detect language RDD") { - val languageCounts = RecordLoader.loadArchives(arcPath, sc) + val languageCounts = RecordLoader + .loadArchives(arcPath, sc) .keepMimeTypes(Set("text/html")) .map(r => RemoveHTML(r.getContentString)) .groupBy(content => DetectLanguage(content)) @@ -95,30 +109,32 @@ class ArcTest extends FunSuite with BeforeAndAfter { case ("lt", count) => assert(61L == count) case ("no", count) => assert(6L == count) case ("ro", count) => assert(4L == count) - case (_, count) => print(_) + case (_, count) => print(_) } } test("Detect MIMEtype Tika RDD") { - val mimeTypeCounts = RecordLoader.loadArchives(arcPath, sc) + val mimeTypeCounts = RecordLoader + .loadArchives(arcPath, sc) .map(r => RemoveHTTPHeader(r.getContentString)) .groupBy(content => DetectMimeTypeTika(content.getBytes)) .map(f => { (f._1, f._2.size) - }).collect + }) + .collect mimeTypeCounts.foreach { - case ("image/gif", count) => assert(29L == count) - case ("image/png", count) => assert(8L == count) - case ("image/jpeg", count) => assert(18L == count) - case ("text/html", count) => assert(132L == count) - case ("text/plain", count) => assert(86L == count) - case ("application/xml", count) => assert(1L == count) - case ("application/rss+xml", count) => assert(9L == count) - case ("application/xhtml+xml", count) => assert(1L == count) - case ("application/octet-stream", count) => assert(26L == count) + case ("image/gif", count) => assert(29L == count) + case ("image/png", count) => assert(8L == count) + case ("image/jpeg", count) => assert(18L == count) + case ("text/html", count) => assert(132L == count) + case ("text/plain", count) => assert(86L == count) + case ("application/xml", count) => assert(1L == count) + case ("application/rss+xml", count) => assert(9L == count) + case ("application/xhtml+xml", count) => assert(1L == count) + case ("application/octet-stream", count) => assert(26L == count) case ("application/x-shockwave-flash", count) => assert(8L == count) - case (_, count) => print(_) + case (_, count) => print(_) } } diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala index 0cdc196b..29d461dd 100644 --- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala +++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala @@ -52,79 +52,144 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { } test("Resource name produces expected result") { - val textSampleArc = RecordLoader.loadArchives(arcPath, sc) - .map(x => FilenameUtils.getName(x.getArchiveFilename)) - .take(3) - val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) - .map(x => FilenameUtils.getName(x.getArchiveFilename)).take(3) - assert(textSampleArc.deep == Array(exampleArc, - exampleArc, exampleArc).deep) - assert(textSampleWarc.deep == Array(exampleWarc, - exampleWarc, exampleWarc).deep) + val textSampleArc = RecordLoader + .loadArchives(arcPath, sc) + .map(x => FilenameUtils.getName(x.getArchiveFilename)) + .take(3) + val textSampleWarc = RecordLoader + .loadArchives(warcPath, sc) + .map(x => FilenameUtils.getName(x.getArchiveFilename)) + .take(3) + 
assert(textSampleArc.deep == Array(exampleArc, exampleArc, exampleArc).deep) + assert( + textSampleWarc.deep == Array(exampleWarc, exampleWarc, exampleWarc).deep + ) } test("Crawl Dates") { - val textSampleArc = RecordLoader.loadArchives(arcPath, sc) - .map(x => x.getCrawlDate).take(3) - val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) - .map(x => x.getCrawlDate).take(3) - assert(textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep) - assert(textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep) + val textSampleArc = RecordLoader + .loadArchives(arcPath, sc) + .map(x => x.getCrawlDate) + .take(3) + val textSampleWarc = RecordLoader + .loadArchives(warcPath, sc) + .map(x => x.getCrawlDate) + .take(3) + assert( + textSampleArc.deep == Array(exampleDate, exampleDate, exampleDate).deep + ) + assert( + textSampleWarc.deep == Array(exampleDate, exampleDate, exampleDate).deep + ) } test("Domains") { - val textSampleArc = RecordLoader.loadArchives(arcPath, sc) - .map(x => x.getDomain).take(3) - val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) - .map(x => x.getDomain).take(3) + val textSampleArc = RecordLoader + .loadArchives(arcPath, sc) + .map(x => x.getDomain) + .take(3) + val textSampleWarc = RecordLoader + .loadArchives(warcPath, sc) + .map(x => x.getDomain) + .take(3) assert(textSampleArc.deep == Array("", "", exampleUrl).deep) assert(textSampleWarc.deep == Array("", exampleUrl, exampleUrl).deep) } test("URLs") { - val textSampleArc = RecordLoader.loadArchives(arcPath, sc) - .map(x => x.getUrl).take(3) - val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) - .map(x => x.getUrl).take(3) - assert(textSampleArc.deep == Array("filedesc://IAH-20080430204825-00000-blackbook.arc", - "dns:www.archive.org", "http://www.archive.org/robots.txt").deep) - assert(textSampleWarc.deep == Array("dns:www.archive.org", - "http://www.archive.org/robots.txt", "http://www.archive.org/").deep) + val textSampleArc = RecordLoader + .loadArchives(arcPath, sc) + .map(x => x.getUrl) + .take(3) + val textSampleWarc = RecordLoader + .loadArchives(warcPath, sc) + .map(x => x.getUrl) + .take(3) + assert( + textSampleArc.deep == Array( + "filedesc://IAH-20080430204825-00000-blackbook.arc", + "dns:www.archive.org", + "http://www.archive.org/robots.txt" + ).deep + ) + assert( + textSampleWarc.deep == Array( + "dns:www.archive.org", + "http://www.archive.org/robots.txt", + "http://www.archive.org/" + ).deep + ) } test("MIMEtype") { - val textSampleArc = RecordLoader.loadArchives(arcPath, sc) - .map(x => x.getMimeType).take(3) - val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) - .map(x => x.getMimeType).take(3) - assert (textSampleArc.deep == Array (exampleMimeType, "text/dns", - exampleMimeType).deep) - assert (textSampleWarc.deep == Array("unknown", exampleMimeType, - "text/html").deep) + val textSampleArc = RecordLoader + .loadArchives(arcPath, sc) + .map(x => x.getMimeType) + .take(3) + val textSampleWarc = RecordLoader + .loadArchives(warcPath, sc) + .map(x => x.getMimeType) + .take(3) + assert( + textSampleArc.deep == Array( + exampleMimeType, + "text/dns", + exampleMimeType + ).deep + ) + assert( + textSampleWarc.deep == Array("unknown", exampleMimeType, "text/html").deep + ) } test("Get HTTP status") { - val textSampleArc = RecordLoader.loadArchives(arcPath, sc) - .map(x => x.getHttpStatus).take(3) - val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) - .map(x => x.getHttpStatus).take(3) - assert (textSampleArc.deep == 
Array(exampleStatusCode1, exampleStatusCode1, - exampleStatusCode2).deep) - assert (textSampleWarc.deep == Array(exampleStatusCode1, exampleStatusCode2, - exampleStatusCode2).deep) + val textSampleArc = RecordLoader + .loadArchives(arcPath, sc) + .map(x => x.getHttpStatus) + .take(3) + val textSampleWarc = RecordLoader + .loadArchives(warcPath, sc) + .map(x => x.getHttpStatus) + .take(3) + assert( + textSampleArc.deep == Array( + exampleStatusCode1, + exampleStatusCode1, + exampleStatusCode2 + ).deep + ) + assert( + textSampleWarc.deep == Array( + exampleStatusCode1, + exampleStatusCode2, + exampleStatusCode2 + ).deep + ) } test("Get Payload Digest") { - val textSampleArc = RecordLoader.loadArchives(arcPath, sc) - .map(x => x.getPayloadDigest).take(3) - val textSampleWarc = RecordLoader.loadArchives(warcPath, sc) - .map(x => x.getPayloadDigest).take(3) - assert (textSampleArc.deep == Array("sha1:252efd6dd414d91812dd9b0f897cdb2b44f64601", - "sha1:8d115d0e83c5dcd66b13619e04d60a36cb2c1ee4", - "sha1:ede22581685942721c7b9743dced317633d00e33").deep) - assert (textSampleWarc.deep == Array(null, - "sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U", - "sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV").deep) + val textSampleArc = RecordLoader + .loadArchives(arcPath, sc) + .map(x => x.getPayloadDigest) + .take(3) + val textSampleWarc = RecordLoader + .loadArchives(warcPath, sc) + .map(x => x.getPayloadDigest) + .take(3) + assert( + textSampleArc.deep == Array( + "sha1:252efd6dd414d91812dd9b0f897cdb2b44f64601", + "sha1:8d115d0e83c5dcd66b13619e04d60a36cb2c1ee4", + "sha1:ede22581685942721c7b9743dced317633d00e33" + ).deep + ) + assert( + textSampleWarc.deep == Array( + null, + "sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U", + "sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV" + ).deep + ) } after { diff --git a/src/test/scala/io/archivesunleashed/CountableRDDTest.scala b/src/test/scala/io/archivesunleashed/CountableRDDTest.scala index 395479ae..ec099531 100644 --- a/src/test/scala/io/archivesunleashed/CountableRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/CountableRDDTest.scala @@ -39,7 +39,8 @@ class CountableRDDTest extends FunSuite with BeforeAndAfter { } test("Count records; Extract Domain RDD ") { - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() .map(r => ExtractDomain(r.getUrl)) val r = base diff --git a/src/test/scala/io/archivesunleashed/RecordDFTest.scala b/src/test/scala/io/archivesunleashed/RecordDFTest.scala index 994ffc0d..978bbcfb 100644 --- a/src/test/scala/io/archivesunleashed/RecordDFTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordDFTest.scala @@ -16,11 +16,22 @@ package io.archivesunleashed -import io.archivesunleashed.udfs.{detectLanguage, detectMimeTypeTika, - extractDomain, removeHTML, - hasContent, hasDate, hasDomains, hasHTTPStatus, - hasImages, hasLanguages, hasMIMETypes, - hasMIMETypesTika, hasUrlPatterns, hasUrls} +import io.archivesunleashed.udfs.{ + detectLanguage, + detectMimeTypeTika, + extractDomain, + removeHTML, + hasContent, + hasDate, + hasDomains, + hasHTTPStatus, + hasImages, + hasLanguages, + hasMIMETypes, + hasMIMETypesTika, + hasUrlPatterns, + hasUrls +} import com.google.common.io.Resources import org.apache.spark.sql.functions.lit import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} @@ -46,12 +57,13 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { test("Keep valid pages DF") { val expected = "http://www.archive.org/" - val base = RecordLoader.loadArchives(arcPath, sc) + val base 
= RecordLoader + .loadArchives(arcPath, sc) .all() .keepValidPagesDF() .take(1)(0)(1) - assert (base.toString == expected) + assert(base.toString == expected) } test("Has HTTP Status") { @@ -61,13 +73,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { // scalastyle:on val expected = "000" - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"http_status_code") - .filter(hasHTTPStatus($"http_status_code", lit(Array("200","000")))) + .filter(hasHTTPStatus($"http_status_code", lit(Array("200", "000")))) .take(1)(0)(0) - assert (base.toString == expected) + assert(base.toString == expected) } test("Has URLs") { @@ -78,20 +91,32 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { val expected1 = "http://www.archive.org/robots.txt" val expected2 = "http://www.archive.org/" - val base1 = RecordLoader.loadArchives(arcPath, sc) + val base1 = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"url") - .filter(hasUrls($"url", lit(Array("http://www.archive.org/","http://www.archive.org/robots.txt")))) + .filter( + hasUrls( + $"url", + lit( + Array( + "http://www.archive.org/", + "http://www.archive.org/robots.txt" + ) + ) + ) + ) .take(1)(0)(0) - val base2 = RecordLoader.loadArchives(arcPath, sc) + val base2 = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"url") .filter(hasUrls($"url", lit(Array("http://www.archive.org/")))) .take(1)(0)(0) - assert (base1.toString == expected1) - assert (base2.toString == expected2) + assert(base1.toString == expected1) + assert(base2.toString == expected2) } test("Has domains") { @@ -101,13 +126,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { // scalastyle:on val expected = "http://www.archive.org/robots.txt" - val base1 = RecordLoader.loadArchives(arcPath, sc) + val base1 = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"url") .filter(hasDomains(extractDomain($"url"), lit(Array("www.archive.org")))) .take(1)(0)(0) - assert (base1.toString == expected) + assert(base1.toString == expected) } test("Has MIME Types") { @@ -117,13 +143,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { // scalastyle:on val expected = "text/html" - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"mime_type_web_server") .filter(hasMIMETypes($"mime_type_web_server", lit(Array("text/html")))) .take(1)(0)(0) - assert (base.toString == expected) + assert(base.toString == expected) } test("Has MIME Types Tika") { @@ -133,13 +160,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { // scalastyle:on val expected = "text/html" - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"mime_type_web_server") .filter(hasMIMETypesTika($"mime_type_tika", lit(Array("text/html")))) .take(1)(0)(0) - assert (base.toString == expected) + assert(base.toString == expected) } test("Has Content") { @@ -149,13 +177,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { // scalastyle:on val expected = "http://www.archive.org/images/logoc.jpg" - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .all() - .select($"url",$"content") + .select($"url", $"content") .filter(hasContent($"content", lit(Array("Content-Length: [0-9]{4}")))) .take(1)(0)(0) - assert (base.toString == expected) + assert(base.toString == expected) } test("Has URL Patterns") { @@ -165,21 
+194,23 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { // scalastyle:on val expected1 = "http://www.archive.org/images/go-button-gateway.gif" - val base1 = RecordLoader.loadArchives(arcPath, sc) + val base1 = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"url") .filter(hasUrlPatterns($"url", lit(Array(".*images.*")))) .take(2)(1)(0) val expected2 = "http://www.archive.org/index.php?skin=classic" - val base2 = RecordLoader.loadArchives(arcPath, sc) + val base2 = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"url") .filter(hasUrlPatterns($"url", lit(Array(".*index.*")))) .take(3)(1)(0) - assert (base1.toString == expected1) - assert (base2.toString == expected2) + assert(base1.toString == expected1) + assert(base2.toString == expected2) } test("Has Languages") { @@ -189,13 +220,19 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { // scalastyle:on val expected = "de" - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .all() .select(detectLanguage(removeHTML($"content")).as("language")) - .filter(hasLanguages(detectLanguage(removeHTML($"content")), lit(Array("de","ht")))) + .filter( + hasLanguages( + detectLanguage(removeHTML($"content")), + lit(Array("de", "ht")) + ) + ) .take(1)(0)(0) - assert (base.toString == expected) + assert(base.toString == expected) } test("Has Images") { @@ -205,13 +242,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { // scalastyle:on val expected = "image/jpeg" - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"mime_type_tika") .filter(hasImages($"crawl_date", detectMimeTypeTika($"bytes"))) .take(1)(0)(0) - assert (base.toString == expected) + assert(base.toString == expected) } test("Has Date") { @@ -221,13 +259,14 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { // scalastyle:on val expected = Array("20080430") - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .all() .select($"crawl_date") .filter(hasDate($"crawl_date", lit(expected))) .take(1)(0)(0) - assert (base.toString == "20080430") + assert(base.toString == "20080430") } after { diff --git a/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala index 9260e666..f5a7c802 100644 --- a/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordLoaderTest.scala @@ -38,7 +38,8 @@ class RecordLoaderTest extends FunSuite with BeforeAndAfter { } test("Load WARC") { - val base = RecordLoader.loadArchives(warcPath, sc) + val base = RecordLoader + .loadArchives(warcPath, sc) .keepValidPages() .map(x => x.getUrl) .take(1) diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala index 6c5efcd6..a324aa33 100644 --- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala @@ -45,16 +45,20 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { test("Expect no valid pages RDD") { val expectedLength = 0 - val base = RecordLoader.loadArchives(badPath, sc) - .keepValidPages().take(2) - assert (base.length == expectedLength) + val base = RecordLoader + .loadArchives(badPath, sc) + .keepValidPages() + .take(2) + assert(base.length == expectedLength) } - test ("Expect no images RDD") { + test("Expect no images RDD") { val 
expectedLength = 0 - val base = RecordLoader.loadArchives(badPath, sc) - .keepValidPages().take(2) - assert (base.length == expectedLength) + val base = RecordLoader + .loadArchives(badPath, sc) + .keepValidPages() + .take(2) + assert(base.length == expectedLength) } test("Keep date RDD") { @@ -62,170 +66,214 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { val base = RecordLoader.loadArchives(arcPath, sc) val component = DateComponent.YYYY val r = base - .filter (x => ExtractDate(x.getCrawlDate, component) == testDate) - .map ( mp => mp.getUrl).take(3) - val r2 = base.keepDate(List(testDate), component) - .map ( mp => mp.getUrl).take(3) - assert (r2.sameElements(r)) } + .filter(x => ExtractDate(x.getCrawlDate, component) == testDate) + .map(mp => mp.getUrl) + .take(3) + val r2 = base + .keepDate(List(testDate), component) + .map(mp => mp.getUrl) + .take(3) + assert(r2.sameElements(r)) + } - test ("Keep HTTP status codes RDD") { + test("Keep HTTP status codes RDD") { val expected = 94 - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() - val statusCodes: Set[String] = Set ("200", "404") + val statusCodes: Set[String] = Set("200", "404") val r2 = base.keepHttpStatus(statusCodes).count - assert (r2 == expected) + assert(r2 == expected) } - test ("Keep URLs RDD") { + test("Keep URLs RDD") { val expected = 1 - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() - val urls: Set[String] = Set (archive, sloan) + val urls: Set[String] = Set(archive, sloan) val r2 = base.keepUrls(urls).count - assert (r2 == expected) + assert(r2 == expected) } - test ("Keep URL patterns RDD") { + test("Keep URL patterns RDD") { val expected = 1 - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() - val urls = Set (archive.r, sloan.r, "".r) + val urls = Set(archive.r, sloan.r, "".r) val r2 = base.keepUrlPatterns(urls).count - assert (r2 == expected) + assert(r2 == expected) } - test ("Keep domains RDD") { + test("Keep domains RDD") { val expected = 91 - val base2 = RecordLoader.loadArchives(arcPath, sc) + val base2 = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() val urls: Set[String] = Set("www.archive.org", "www.sloan.org") val x2 = base2.keepDomains(urls).count() - assert (x2 == expected ) + assert(x2 == expected) } - test ("Keep languages RDD") { - val base2 = RecordLoader.loadArchives(arcPath, sc) + test("Keep languages RDD") { + val base2 = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() val langs: Set[String] = Set("en", "fr") - val r = Array("http://www.archive.org/", - "http://www.archive.org/index.php") - val r2 = base2.keepLanguages(langs) - .map(r => r.getUrl).take(2) - assert (r2.sameElements(r)) + val r = Array("http://www.archive.org/", "http://www.archive.org/index.php") + val r2 = base2 + .keepLanguages(langs) + .map(r => r.getUrl) + .take(2) + assert(r2.sameElements(r)) } - test ("Discard languages RDD") { - val base2 = RecordLoader.loadArchives(arcPath, sc) + test("Discard languages RDD") { + val base2 = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() val langs: Set[String] = Set("fr") val r = Array("http://www.archive.org/", "http://www.archive.org/index.php") - val r2 = base2.discardLanguages(langs) - .map(r => r.getUrl).take(2) - assert (r2.sameElements(r)) + val r2 = base2 + .discardLanguages(langs) + .map(r => r.getUrl) + .take(2) 
+ assert(r2.sameElements(r)) } - test ("Keep MIMEtype Tika RDD") { + test("Keep MIMEtype Tika RDD") { val base = RecordLoader.loadArchives(arcPath, sc) - val mime = Set ("text/plain", "image/jpeg") - val r2 = base.keepMimeTypesTika(mime) - .map (mp => mp.getUrl).take(3) - assert (r2.deep == Array("dns:www.archive.org", - "http://www.archive.org/robots.txt", - "http://www.archive.org/images/logoc.jpg").deep) + val mime = Set("text/plain", "image/jpeg") + val r2 = base + .keepMimeTypesTika(mime) + .map(mp => mp.getUrl) + .take(3) + assert( + r2.deep == Array( + "dns:www.archive.org", + "http://www.archive.org/robots.txt", + "http://www.archive.org/images/logoc.jpg" + ).deep + ) } - test ("Keep MIMEtype RDD") { + test("Keep MIMEtype RDD") { val base = RecordLoader.loadArchives(arcPath, sc) - val mime = Set ("text/plain", "image/jpeg") - val r2 = base.keepMimeTypes(mime) - .map (mp => mp.getUrl).take(3) - assert (r2.deep == Array("filedesc://IAH-20080430204825-00000-blackbook.arc", - "http://www.archive.org/robots.txt", - "http://www.archive.org/images/logoc.jpg").deep) + val mime = Set("text/plain", "image/jpeg") + val r2 = base + .keepMimeTypes(mime) + .map(mp => mp.getUrl) + .take(3) + assert( + r2.deep == Array( + "filedesc://IAH-20080430204825-00000-blackbook.arc", + "http://www.archive.org/robots.txt", + "http://www.archive.org/images/logoc.jpg" + ).deep + ) } - test ("Keep content RDD"){ + test("Keep content RDD") { val expected = 1 - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() val regno = Set(regex, raw"UNINTELLIBLEDFSJKLS".r) val y2 = base.keepContent(Set(regex)).count() val y1 = base.keepContent(regno).count() - assert (y2 == expected) - assert (y1 == expected) + assert(y2 == expected) + assert(y1 == expected) } - test ("Discard MIMEtype RDD") { + test("Discard MIMEtype RDD") { val base = RecordLoader.loadArchives(arcPath, sc) - val mime = Set ("text/plain", "image/jpeg") - val r2 = base.discardMimeTypes(mime) - .map (mp => mp.getUrl).take(3) - assert (r2.deep == Array("dns:www.archive.org", archive, - "http://www.archive.org/index.php").deep) + val mime = Set("text/plain", "image/jpeg") + val r2 = base + .discardMimeTypes(mime) + .map(mp => mp.getUrl) + .take(3) + assert( + r2.deep == Array( + "dns:www.archive.org", + archive, + "http://www.archive.org/index.php" + ).deep + ) } - test ("Discard MIMEtype Tika RDD") { + test("Discard MIMEtype Tika RDD") { val base = RecordLoader.loadArchives(arcPath, sc) - val mime = Set ("text/plain", "image/jpeg") - val r2 = base.discardMimeTypesTika(mime) - .map (mp => mp.getUrl).take(3) - assert (r2.deep == Array("filedesc://IAH-20080430204825-00000-blackbook.arc", - "http://www.archive.org/", "http://www.archive.org/index.php").deep) + val mime = Set("text/plain", "image/jpeg") + val r2 = base + .discardMimeTypesTika(mime) + .map(mp => mp.getUrl) + .take(3) + assert( + r2.deep == Array( + "filedesc://IAH-20080430204825-00000-blackbook.arc", + "http://www.archive.org/", + "http://www.archive.org/index.php" + ).deep + ) } - test ("Discard date RDD") { + test("Discard date RDD") { val base = RecordLoader.loadArchives(arcPath, sc) val date = "20080430" - val r = base.filter( x=> x.getCrawlDate != date).collect() + val r = base.filter(x => x.getCrawlDate != date).collect() val r2 = base.discardDate(date).take(3) - assert (r.deep == Array().deep) + assert(r.deep == Array().deep) } - test ("Discard URLs RDD") { + test("Discard URLs RDD") { val expected = 94 - val base = 
RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() - val urls: Set[String] = Set (sloan) + val urls: Set[String] = Set(sloan) val r2 = base.discardUrls(urls).count() - assert (r2 == expected) + assert(r2 == expected) } - test ("Discard URL patterns RDD") { + test("Discard URL patterns RDD") { val expected = 93 - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() - val urls = Set (archive.r, sloan.r, "".r) + val urls = Set(archive.r, sloan.r, "".r) val r2 = base.discardUrlPatterns(urls).count - assert (r2 == expected) + assert(r2 == expected) } - test ("Discard HTTP status codes RDD") { + test("Discard HTTP status codes RDD") { val expected = 46 val base = RecordLoader.loadArchives(arcPath, sc) - val statusCodes: Set[String] = Set ("200", "404") + val statusCodes: Set[String] = Set("200", "404") val r2 = base.discardHttpStatus(statusCodes).count - assert (r2 == expected) + assert(r2 == expected) } - test ("Discard domains RDD") { + test("Discard domains RDD") { val expected = 94 - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() - val urls: Set[String] = Set ("www.sloan.org") + val urls: Set[String] = Set("www.sloan.org") val r2 = base.discardDomains(urls).count() - assert (r2 == expected) + assert(r2 == expected) } - test ("Discard content RDD") { + test("Discard content RDD") { val expected = 93 - val base = RecordLoader.loadArchives(arcPath, sc) + val base = RecordLoader + .loadArchives(arcPath, sc) .keepValidPages() val regno = Set(regex, raw"UNINTELLIBLEDFSJKLS".r) val y2 = base.discardContent(Set(regex)).count() val y1 = base.discardContent(regno).count() - assert (y2 == expected) - assert (y1 == expected) + assert(y2 == expected) + assert(y1 == expected) } after { diff --git a/src/test/scala/io/archivesunleashed/WarcTest.scala b/src/test/scala/io/archivesunleashed/WarcTest.scala index 5eb2b112..44b9879f 100644 --- a/src/test/scala/io/archivesunleashed/WarcTest.scala +++ b/src/test/scala/io/archivesunleashed/WarcTest.scala @@ -58,7 +58,8 @@ class WarcTest extends FunSuite with BeforeAndAfter { } test("WARC get content RDD") { - val a = RecordLoader.loadArchives(warcPath, sc) + val a = RecordLoader + .loadArchives(warcPath, sc) .map(r => r.getContentString) .take(1) assert(a.head.nonEmpty) diff --git a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala index ad8730af..b579b905 100644 --- a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala @@ -25,7 +25,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite} @RunWith(classOf[JUnitRunner]) class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter { - private val arcPath = Resources.getResource("warc/example.media.warc.gz").getPath + private val arcPath = + Resources.getResource("warc/example.media.warc.gz").getPath private var sc: SparkContext = _ private val master = "local[4]" private val appName = "example-spark" diff --git a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala index 7083fce2..2e6fba90 100644 --- a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala +++ 
b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala @@ -43,78 +43,722 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { private val webPagesOpt = "WebPagesExtractor" private var sc: SparkContext = _ private val testSuccessCmds = Array( - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "DomainFrequencyExtractor", "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "gexf"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "graphml"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, domainGraphOpt, "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, plainTextOpt, "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, "--partition", "1", extractOpt, plainTextOpt), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, imageGraphOpt, "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, webPagesOpt, "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, 
outputOpt, outputDir, extractOpt, "AudioInformationExtractor", "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "ImageInformationExtractor", "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PDFInformationExtractor", "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "PresentationProgramInformationExtractor", "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "SpreadsheetInformationExtractor", "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "VideoInformationExtractor", "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--split"), - Array(inputOpt, arcPath, warcPath, 
outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WordProcessorInformationExtractor", "--output-format", "parquet", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--split"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--partition", "1"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "WebGraphExtractor", "--output-format", "parquet", "--partition", "1") + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "DomainFrequencyExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "DomainFrequencyExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "DomainFrequencyExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "DomainFrequencyExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + domainGraphOpt + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + domainGraphOpt, + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + domainGraphOpt, + "--output-format", + "gexf" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + domainGraphOpt, + "--output-format", + "graphml" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + domainGraphOpt, + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + plainTextOpt + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + plainTextOpt, + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + plainTextOpt, + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + plainTextOpt, + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + plainTextOpt, + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + "--partition", + "1", + extractOpt, + plainTextOpt + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + imageGraphOpt + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + imageGraphOpt, + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + imageGraphOpt, + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + imageGraphOpt, + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + 
extractOpt, + imageGraphOpt, + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + webPagesOpt + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + webPagesOpt, + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + webPagesOpt, + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + webPagesOpt, + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + webPagesOpt, + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "AudioInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "AudioInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "AudioInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "AudioInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "AudioInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "ImageInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "ImageInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "ImageInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "ImageInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "ImageInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PDFInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PDFInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PDFInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PDFInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PDFInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PresentationProgramInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PresentationProgramInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PresentationProgramInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PresentationProgramInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + 
outputOpt, + outputDir, + extractOpt, + "PresentationProgramInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "SpreadsheetInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "SpreadsheetInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "SpreadsheetInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "SpreadsheetInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "SpreadsheetInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "VideoInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "VideoInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "VideoInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "VideoInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "VideoInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WordProcessorInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WordProcessorInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WordProcessorInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WordProcessorInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WordProcessorInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WebGraphExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WebGraphExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WebGraphExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WebGraphExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "WebGraphExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ) ) private val testFailCmds = Array( Array(inputOpt, "_abracadabra", outputOpt, outputDir), Array(outputOpt, outputDir), Array(inputOpt, "_abracadabra"), - Array(inputOpt, arcPath, warcPath, outputOpt, outputDir, extractOpt, "abracadabra") + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "abracadabra" + ) ) before { @@ -125,19 +769,19 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { } test("Command line app functionality 
tests") { - for {a <- testSuccessCmds} { + for { a <- testSuccessCmds } { app.CommandLineAppRunner.test(a, sc) assert(Files.exists(Paths.get(outputDir))) FileUtils.deleteDirectory(new File(outputDir)) } - for {a <- testFailCmds} { + for { a <- testFailCmds } { try { app.CommandLineAppRunner.test(a, sc) assert(false) } catch { case e: IllegalArgumentException => assert(true) - case _: Throwable => assert(false) + case _: Throwable => assert(false) } finally { assert(!Files.exists(Paths.get(outputDir))) } diff --git a/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala index cc75e31b..e40ed89b 100644 --- a/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala +++ b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesDFTest.scala @@ -33,8 +33,8 @@ class ExtractPopularImagesDFTest extends FunSuite with BeforeAndAfter { before { val conf = new SparkConf() - .setMaster(master) - .setAppName(appName) + .setMaster(master) + .setAppName(appName) conf.set("spark.driver.allowMultipleContexts", "true"); sc = new SparkContext(conf) } @@ -45,8 +45,8 @@ class ExtractPopularImagesDFTest extends FunSuite with BeforeAndAfter { val imagesLowLimit = ExtractPopularImagesDF(exampledf, 3) val imagesHighLimit = ExtractPopularImagesDF(exampledf, highTest) val response = "1" - assert (imagesLowLimit.take(1)(0)(1).toString == response) - assert (imagesHighLimit.take(1)(0)(1).toString == response) + assert(imagesLowLimit.take(1)(0)(1).toString == response) + assert(imagesHighLimit.take(1)(0)(1).toString == response) } after { if (sc != null) { diff --git a/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala index 51d58c93..c9a1db77 100644 --- a/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala +++ b/src/test/scala/io/archivesunleashed/app/ExtractPopularImagesTest.scala @@ -33,8 +33,8 @@ class ExtractPopularImagesTest extends FunSuite with BeforeAndAfter { before { val conf = new SparkConf() - .setMaster(master) - .setAppName(appName) + .setMaster(master) + .setAppName(appName) conf.set("spark.driver.allowMultipleContexts", "true"); sc = new SparkContext(conf) } @@ -44,11 +44,13 @@ class ExtractPopularImagesTest extends FunSuite with BeforeAndAfter { val examplerdd = RecordLoader.loadArchives(arcPath, sc) val imagesLowLimit = ExtractPopularImages(examplerdd, 3, sc) val imagesHighLimit = ExtractPopularImages(examplerdd, highTest, sc) - val response = Array("1\thttp://www.archive.org/images/books-small.jpg", + val response = Array( + "1\thttp://www.archive.org/images/books-small.jpg", "1\thttp://i.creativecommons.org/l/by-sa/3.0/88x31.png", - "1\thttp://www.archive.org/images/blendbar.jpg") - assert (imagesLowLimit.take(3).deep == response.deep) - assert (imagesHighLimit.take(3).deep == response.deep) + "1\thttp://www.archive.org/images/blendbar.jpg" + ) + assert(imagesLowLimit.take(3).deep == response.deep) + assert(imagesHighLimit.take(3).deep == response.deep) } after { if (sc != null) { diff --git a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala index 9ef66f7b..ab88280c 100644 --- a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala @@ -25,7 +25,8 @@ import 
org.scalatest.{BeforeAndAfter, FunSuite} @RunWith(classOf[JUnitRunner]) class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter { - private val arcPath = Resources.getResource("warc/example.pdf.warc.gz").getPath + private val arcPath = + Resources.getResource("warc/example.pdf.warc.gz").getPath private var sc: SparkContext = _ private val master = "local[4]" private val appName = "example-spark" @@ -44,7 +45,11 @@ class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter { val RESULTSLENGTH = 2 assert(dfResults.length == RESULTSLENGTH) - assert(dfResults(0).get(0) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y") + assert( + dfResults(0).get( + 0 + ) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" + ) assert(dfResults(0).get(1) == "cost-analysis.pdf") assert(dfResults(0).get(2) == "pdf") assert(dfResults(0).get(3) == "application/pdf") diff --git a/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala index 4622ad76..4bd75644 100644 --- a/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/PlainTextExtractorTest.scala @@ -45,12 +45,20 @@ class PlainTextExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "") - assert(dfResults(4).get(0) - .toString - .startsWith("Author: Spivak, John L. (John Louis), b. 1897 Published: 1939")) - assert(dfResults(50).get(0) - .toString - .startsWith("How many hours in a day They tell me 24 ")) + assert( + dfResults(4) + .get(0) + .toString + .startsWith( + "Author: Spivak, John L. (John Louis), b. 
1897 Published: 1939" + ) + ) + assert( + dfResults(50) + .get(0) + .toString + .startsWith("How many hours in a day They tell me 24 ") + ) } after { diff --git a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala index 8be73740..6bdfee35 100644 --- a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala @@ -24,8 +24,11 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} @RunWith(classOf[JUnitRunner]) -class PresentationProgramInformationExtractorTest extends FunSuite with BeforeAndAfter { - private val arcPath = Resources.getResource("warc/example.docs.warc.gz").getPath +class PresentationProgramInformationExtractorTest + extends FunSuite + with BeforeAndAfter { + private val arcPath = + Resources.getResource("warc/example.docs.warc.gz").getPath private var sc: SparkContext = _ private val master = "local[4]" private val appName = "example-spark" @@ -44,11 +47,23 @@ class PresentationProgramInformationExtractorTest extends FunSuite with BeforeAn val RESULTSLENGTH = 2 assert(dfResults.length == RESULTSLENGTH) - assert(dfResults(0).get(0) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx") + assert( + dfResults(0).get( + 0 + ) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" + ) assert(dfResults(0).get(1) == "aut-test-fixtures.pptx") assert(dfResults(0).get(2) == "pptx") - assert(dfResults(0).get(3) == "application/vnd.openxmlformats-officedocument.presentationml.presentation") - assert(dfResults(0).get(4) == "application/vnd.openxmlformats-officedocument.presentationml.presentation") + assert( + dfResults(0).get( + 3 + ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert( + dfResults(0).get( + 4 + ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) assert(dfResults(0).get(5) == "7a7b1fe4b6d311376eaced9de3b682ee") assert(dfResults(0).get(6) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec") } diff --git a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala index 0754812f..79b8c781 100644 --- a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala @@ -25,7 +25,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite} @RunWith(classOf[JUnitRunner]) class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter { - private val arcPath = Resources.getResource("warc/example.docs.warc.gz").getPath + private val arcPath = + Resources.getResource("warc/example.docs.warc.gz").getPath private var sc: SparkContext = _ private val master = "local[4]" private val appName = "example-spark" @@ -44,11 +45,19 @@ class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter { val RESULTSLENGTH = 4 assert(dfResults.length == RESULTSLENGTH) - assert(dfResults(0).get(0) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods") + assert( + dfResults(0).get( + 0 + ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods" + ) assert(dfResults(0).get(1) == "test-aut-fixture.ods") assert(dfResults(0).get(2) == "ods") - 
assert(dfResults(0).get(3) == "application/vnd.oasis.opendocument.spreadsheet") - assert(dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet") + assert( + dfResults(0).get(3) == "application/vnd.oasis.opendocument.spreadsheet" + ) + assert( + dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet" + ) assert(dfResults(0).get(5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9") assert(dfResults(0).get(6) == "448c357e78317877a98a399448031a89f1dda6fb") } diff --git a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala index 1d4cec03..ac525428 100644 --- a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala @@ -25,7 +25,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite} @RunWith(classOf[JUnitRunner]) class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter { - private val arcPath = Resources.getResource("warc/example.media.warc.gz").getPath + private val arcPath = + Resources.getResource("warc/example.media.warc.gz").getPath private var sc: SparkContext = _ private val master = "local[4]" private val appName = "example-spark" @@ -44,7 +45,9 @@ class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter { val RESULTSLENGTH = 1 assert(dfResults.length == RESULTSLENGTH) - assert(dfResults(0).get(0) == "https://ruebot.net/2018-11-12%2016.14.11.mp4") + assert( + dfResults(0).get(0) == "https://ruebot.net/2018-11-12%2016.14.11.mp4" + ) assert(dfResults(0).get(1) == "2018-11-12%2016.14.11.mp4") assert(dfResults(0).get(2) == "mp4") assert(dfResults(0).get(3) == "video/mp4") diff --git a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala index 2668d713..8ea033d3 100644 --- a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala @@ -24,8 +24,11 @@ import org.scalatest.junit.JUnitRunner import org.scalatest.{BeforeAndAfter, FunSuite} @RunWith(classOf[JUnitRunner]) -class WordProcessorInformationExtractorTest extends FunSuite with BeforeAndAfter { - private val arcPath = Resources.getResource("warc/example.docs.warc.gz").getPath +class WordProcessorInformationExtractorTest + extends FunSuite + with BeforeAndAfter { + private val arcPath = + Resources.getResource("warc/example.docs.warc.gz").getPath private var sc: SparkContext = _ private val master = "local[4]" private val appName = "example-spark" @@ -44,7 +47,11 @@ class WordProcessorInformationExtractorTest extends FunSuite with BeforeAndAfter val RESULTSLENGTH = 3 assert(dfResults.length == RESULTSLENGTH) - assert(dfResults(0).get(0) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf") + assert( + dfResults(0).get( + 0 + ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf" + ) assert(dfResults(0).get(1) == "test-aut-fixtures.rtf") assert(dfResults(0).get(2) == "rtf") assert(dfResults(0).get(3) == "application/rtf") diff --git a/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala b/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala index 41a94c8c..d0812384 100644 --- a/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WriteGEXFTest.scala @@ -25,44 
+25,56 @@ import org.scalatest.{BeforeAndAfter, FunSuite} import scala.io.Source @RunWith(classOf[JUnitRunner]) -class WriteGEXFTest extends FunSuite with BeforeAndAfter{ +class WriteGEXFTest extends FunSuite with BeforeAndAfter { private var sc: SparkContext = _ private val master = "local[4]" private val appName = "example-spark" - private val network = Seq(("Date1", "Source1", "Destination1", 3), - ("Date2", "Source2", "Destination2", 4), - ("Date3", "Source3", "Destination3", 100)) + private val network = Seq( + ("Date1", "Source1", "Destination1", 3), + ("Date2", "Source2", "Destination2", 4), + ("Date3", "Source3", "Destination3", 100) + ) private val testFile = "temporaryTestFile.gexf" before { val conf = new SparkConf() .setMaster(master) .setAppName(appName) - conf.set("spark.driver.allowMultipleContexts", "true"); - sc = new SparkContext(conf) - } + conf.set("spark.driver.allowMultipleContexts", "true"); + sc = new SparkContext(conf) + } test("Creates the GEXF file from Array[Row]") { val testLines = (0, 12, 22, 34) if (Files.exists(Paths.get(testFile))) { new File(testFile).delete() } - val networkarray = Array(Row.fromTuple(network(0)), - Row.fromTuple(network(1)), Row.fromTuple(network(2))) + val networkarray = Array( + Row.fromTuple(network(0)), + Row.fromTuple(network(1)), + Row.fromTuple(network(2)) + ) val ret = WriteGEXF(networkarray, testFile) assert(ret) val lines = Source.fromFile(testFile).getLines.toList assert(lines(testLines._1) == """""") - assert(lines(testLines._2) == """""") + assert( + lines( + testLines._2 + ) == """""" + ) assert(lines(testLines._3) == """""") assert(lines(testLines._4) == """""") - assert(!WriteGEXF(networkarray ,"")) + assert(!WriteGEXF(networkarray, "")) } test("Test if GEXF path is empty") { val networkGraph = sc.parallelize(network) - val networkarray = Array(Row.fromTuple(network(0)), - Row.fromTuple(network(1)), Row.fromTuple(network(2))) + val networkarray = Array( + Row.fromTuple(network(0)), + Row.fromTuple(network(1)), + Row.fromTuple(network(2)) + ) val gexf = WriteGEXF(networkarray, testFile) assert(gexf) assert(!WriteGEXF(networkarray, "")) diff --git a/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala b/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala index 7f1eb73a..ed264490 100644 --- a/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WriteGraphMLTest.scala @@ -25,33 +25,38 @@ import org.scalatest.{BeforeAndAfter, FunSuite} import scala.io.Source @RunWith(classOf[JUnitRunner]) -class WriteGraphMLTest extends FunSuite with BeforeAndAfter{ +class WriteGraphMLTest extends FunSuite with BeforeAndAfter { private var sc: SparkContext = _ private val master = "local[4]" private val appName = "example-spark" private val linkCountOne = 3 private val linkCountTwo = 4 private val linkCountThree = 100 - private val network = Seq(("Date1", "Source1", "Destination1", linkCountOne), - ("Date2", "Source2", "Destination2", linkCountTwo), - ("Date3", "Source3", "Destination3", linkCountThree)) + private val network = Seq( + ("Date1", "Source1", "Destination1", linkCountOne), + ("Date2", "Source2", "Destination2", linkCountTwo), + ("Date3", "Source3", "Destination3", linkCountThree) + ) private val testFile = "temporaryTestFile.graphml" before { val conf = new SparkConf() .setMaster(master) .setAppName(appName) - conf.set("spark.driver.allowMultipleContexts", "true"); - sc = new SparkContext(conf) - } + conf.set("spark.driver.allowMultipleContexts", "true"); 
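
[Reviewer note] Both graph-writer tests (`WriteGEXFTest` above and `WriteGraphMLTest` here) build their fixture the same way: a `Seq` of `(date, source, destination, count)` tuples converted to Spark SQL `Row`s via `Row.fromTuple`. A hedged sketch of just that conversion, with fixture values mirroring the `network` data in these tests:

```scala
import org.apache.spark.sql.Row

object NetworkRowSketch {
  def main(args: Array[String]): Unit = {
    val network = Seq(
      ("Date1", "Source1", "Destination1", 3),
      ("Date2", "Source2", "Destination2", 4)
    )
    // Row.fromTuple accepts any Product, so mapping it over the tuples
    // yields the Array[Row] shape that WriteGEXF/WriteGraphML consume.
    val networkarray: Array[Row] = network.map(Row.fromTuple).toArray
    assert(networkarray(0).getString(1) == "Source1")
    assert(networkarray(1).getInt(3) == 4)
  }
}
```
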
+ sc = new SparkContext(conf) + } test("Create WriteGraphML file from Array[Row]") { val lineCheck = (0, 15, 22, 30) if (Files.exists(Paths.get(testFile))) { new File(testFile).delete() } - val networkarray = Array(Row.fromTuple(network(0)), - Row.fromTuple(network(1)), Row.fromTuple(network(2))) + val networkarray = Array( + Row.fromTuple(network(0)), + Row.fromTuple(network(1)), + Row.fromTuple(network(2)) + ) val ret = WriteGraphML(networkarray, testFile) assert(ret) assert(Files.exists(Paths.get(testFile))) @@ -62,9 +67,12 @@ class WriteGraphMLTest extends FunSuite with BeforeAndAfter{ assert(lines(lineCheck._4) == """3""") } - test ("Test if GraphML path is empty") { - val networkarray = Array(Row.fromTuple(network(0)), - Row.fromTuple(network(1)), Row.fromTuple(network(2))) + test("Test if GraphML path is empty") { + val networkarray = Array( + Row.fromTuple(network(0)), + Row.fromTuple(network(1)), + Row.fromTuple(network(2)) + ) val graphml = WriteGraphML(networkarray, testFile) assert(graphml) assert(!WriteGraphML(networkarray, "")) diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala index 1b74d973..9e5cd7fc 100644 --- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala @@ -25,10 +25,14 @@ import org.scalatest.{BeforeAndAfter, FunSuite} @RunWith(classOf[JUnitRunner]) class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { private val arcPath = Resources.getResource("arc/example.arc.gz").getPath - private val mediaPath = Resources.getResource("warc/example.media.warc.gz").getPath - private val docPath = Resources.getResource("warc/example.docs.warc.gz").getPath - private val txtPath = Resources.getResource("warc/example.txt.warc.gz").getPath - private val pdfPath = Resources.getResource("warc/example.pdf.warc.gz").getPath + private val mediaPath = + Resources.getResource("warc/example.media.warc.gz").getPath + private val docPath = + Resources.getResource("warc/example.docs.warc.gz").getPath + private val txtPath = + Resources.getResource("warc/example.txt.warc.gz").getPath + private val pdfPath = + Resources.getResource("warc/example.pdf.warc.gz").getPath private val master = "local[4]" private val appName = "example-df" private var sc: SparkContext = _ @@ -67,7 +71,9 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { val r_3 = imagegraph.take(100)(99) assert(r_3.get(0) == "20080430") - assert(r_3.get(1) == "http://www.archive.org/details/secretarmiesb00spivrich") + assert( + r_3.get(1) == "http://www.archive.org/details/secretarmiesb00spivrich" + ) assert(r_3.get(2) == "http://www.archive.org/images/star.png") val r_4 = images.take(1)(0) @@ -75,7 +81,11 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { assert(r_4.getAs[String](md5) == "8211d1fbb9b03d8522a1ae378f9d1b24") val r_5 = pdfs.take(1)(0) - assert(r_5.getAs[String](url) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y") + assert( + r_5.getAs[String]( + url + ) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" + ) assert(r_5.getAs[String](md5) == "aaba59d2287afd40c996488a39bbc0dd") val r_6 = audio.take(1)(0) @@ -83,19 +93,33 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { assert(r_6.getAs[String](md5) == "f7e7ec84b12c294e19af1ba41732c733") val r_7 = 
video.take(1)(0) - assert(r_7.getAs[String](url) == "https://ruebot.net/2018-11-12%2016.14.11.mp4") + assert( + r_7.getAs[String](url) == "https://ruebot.net/2018-11-12%2016.14.11.mp4" + ) assert(r_7.getAs[String](md5) == "2cde7de3213a87269957033f6315fce2") val r_8 = spreadsheets.take(1)(0) - assert(r_8.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods") + assert( + r_8.getAs[String]( + url + ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods" + ) assert(r_8.getAs[String](md5) == "7f70280757d8beb2d1bfd6fb1b6ae6e9") val r_9 = powerpoint.take(1)(0) - assert(r_9.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx") + assert( + r_9.getAs[String]( + url + ) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" + ) assert(r_9.getAs[String](md5) == "7a7b1fe4b6d311376eaced9de3b682ee") val r_10 = word.take(1)(0) - assert(r_10.getAs[String](url) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf") + assert( + r_10.getAs[String]( + url + ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf" + ) assert(r_10.getAs[String](md5) == "e483512b65ba44d71e843c57de2adeb7") val r_11 = all.select(url, mime_type).take(1)(0) diff --git a/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala index beca222b..faef838a 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala @@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite} @RunWith(classOf[JUnitRunner]) class AudioTest extends FunSuite with BeforeAndAfter { - private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath + private val warcPath = + Resources.getResource("warc/example.media.warc.gz").getPath private val master = "local[4]" private val appName = "example-df" private var sc: SparkContext = _ @@ -39,12 +40,22 @@ class AudioTest extends FunSuite with BeforeAndAfter { } test("Audio files extraction DF") { - val df = RecordLoader.loadArchives(warcPath, sc) + val df = RecordLoader + .loadArchives(warcPath, sc) .audio() - val extracted = df.select("url", "filename", "extension", - "mime_type_web_server", "mime_type_tika", "md5") - .orderBy(desc("md5")).head(1).toList + val extracted = df + .select( + "url", + "filename", + "extension", + "mime_type_web_server", + "mime_type_tika", + "md5" + ) + .orderBy(desc("md5")) + .head(1) + .toList assert(extracted.size == 1) assert("https://ruebot.net/files/feniz.mp3" == extracted(0)(0)) assert("feniz.mp3" == extracted(0)(1)) diff --git a/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala index cd5580ea..b6a41f88 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala @@ -17,7 +17,12 @@ package io.archivesunleashed import com.google.common.io.Resources -import io.archivesunleashed.udfs.{extractDate, extractDomain, extractLinks, removePrefixWWW} +import io.archivesunleashed.udfs.{ + extractDate, + extractDomain, + extractLinks, + removePrefixWWW +} import org.apache.spark.sql.functions.{array, explode_outer, lower, udf} import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} @@ -40,7 +45,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter { } 
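
[Reviewer note] The five `ExtractDateDFTest` cases reformatted below all assert the same date-component slicing: a crawl date like `20080430` yields `2008` (YYYY), `200804` (YYYYMM), `04` (MM), `30` (DD), and `20080430` (YYYYMMDD). A plain-Scala sketch of that semantics as the assertions encode it (not the real `extractDate` UDF, which lives in `io.archivesunleashed.udfs`):

```scala
object DateComponentSketch {
  // Slice a YYYYMMDD crawl date down to the requested component.
  def component(date: String, fmt: String): String = fmt match {
    case "YYYY"     => date.substring(0, 4)
    case "YYYYMM"   => date.substring(0, 6)
    case "YYYYMMDD" => date.substring(0, 8)
    case "MM"       => date.substring(4, 6)
    case "DD"       => date.substring(6, 8)
    case other      => sys.error(s"unknown component: $other")
  }

  def main(args: Array[String]): Unit = {
    val crawlDate = "20080430"
    assert(component(crawlDate, "YYYY") == "2008")
    assert(component(crawlDate, "YYYYMM") == "200804")
    assert(component(crawlDate, "MM") == "04")
    assert(component(crawlDate, "DD") == "30")
    assert(component(crawlDate, "YYYYMMDD") == "20080430")
  }
}
```
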
test("Extract dates YYYY DF") { - val df = RecordLoader.loadArchives(arcPath, sc) + val df = RecordLoader + .loadArchives(arcPath, sc) .webpages() val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1)) @@ -52,14 +58,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter { import org.apache.spark.sql.functions._ // scalastyle:on - val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"), - $"url".as("url"), - extractDate($"crawl_date",lit("YYYY")).as("crawl_date"), - explode_outer(extractLinks($"url", $"content")).as("link") - ) - .filter(lower($"content").contains("keynote")) // filtered on keyword internet - - val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3) + val interResults = df + .select( + removePrefixWWW(extractDomain($"url")).as("Domain"), + $"url".as("url"), + extractDate($"crawl_date", lit("YYYY")).as("crawl_date"), + explode_outer(extractLinks($"url", $"content")).as("link") + ) + .filter( + lower($"content").contains("keynote") + ) // filtered on keyword internet + + val results = interResults + .select( + $"url", + $"Domain", + $"crawl_date", + dest(array($"link")).as("destination_page") + ) + .head(3) assert(results(0).get(0) == "http://www.archive.org/index.php") assert(results(0).get(1) == "archive.org") @@ -69,7 +86,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter { assert(results(1).get(0) == "http://www.archive.org/index.php") assert(results(1).get(1) == "archive.org") assert(results(1).get(2) == "2008") - assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html") + assert( + results(1).get( + 3 + ) == "http://web.archive.org/collections/web/advanced.html" + ) assert(results(2).get(0) == "http://www.archive.org/index.php") assert(results(2).get(1) == "archive.org") @@ -78,7 +99,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter { } test("Extract dates YYYYMM DF") { - val df = RecordLoader.loadArchives(arcPath, sc) + val df = RecordLoader + .loadArchives(arcPath, sc) .webpages() val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1)) @@ -90,14 +112,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter { import org.apache.spark.sql.functions._ // scalastyle:on - val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"), - $"url".as("url"), - extractDate($"crawl_date",lit("YYYYMM")).as("crawl_date"), - explode_outer(extractLinks($"url", $"content")).as("link") - ) - .filter(lower($"content").contains("keynote")) // filtered on keyword internet - - val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3) + val interResults = df + .select( + removePrefixWWW(extractDomain($"url")).as("Domain"), + $"url".as("url"), + extractDate($"crawl_date", lit("YYYYMM")).as("crawl_date"), + explode_outer(extractLinks($"url", $"content")).as("link") + ) + .filter( + lower($"content").contains("keynote") + ) // filtered on keyword internet + + val results = interResults + .select( + $"url", + $"Domain", + $"crawl_date", + dest(array($"link")).as("destination_page") + ) + .head(3) assert(results(0).get(0) == "http://www.archive.org/index.php") assert(results(0).get(1) == "archive.org") @@ -107,7 +140,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter { assert(results(1).get(0) == "http://www.archive.org/index.php") assert(results(1).get(1) == "archive.org") assert(results(1).get(2) == "200804") - 
assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+    assert(
+      results(1).get(
+        3
+      ) == "http://web.archive.org/collections/web/advanced.html"
+    )
 
     assert(results(2).get(0) == "http://www.archive.org/index.php")
     assert(results(2).get(1) == "archive.org")
@@ -116,7 +153,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Extract dates MM DF") {
-    val df = RecordLoader.loadArchives(arcPath, sc)
+    val df = RecordLoader
+      .loadArchives(arcPath, sc)
       .webpages()
 
     val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -128,14 +166,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
     import org.apache.spark.sql.functions._
     // scalastyle:on
 
-    val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
-      $"url".as("url"),
-      extractDate($"crawl_date",lit("MM")).as("crawl_date"),
-      explode_outer(extractLinks($"url", $"content")).as("link")
-      )
-      .filter(lower($"content").contains("keynote")) // filtered on keyword internet
-
-    val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3)
+    val interResults = df
+      .select(
+        removePrefixWWW(extractDomain($"url")).as("Domain"),
+        $"url".as("url"),
+        extractDate($"crawl_date", lit("MM")).as("crawl_date"),
+        explode_outer(extractLinks($"url", $"content")).as("link")
+      )
+      .filter(
+        lower($"content").contains("keynote")
+      ) // filtered on keyword internet
+
+    val results = interResults
+      .select(
+        $"url",
+        $"Domain",
+        $"crawl_date",
+        dest(array($"link")).as("destination_page")
+      )
+      .head(3)
 
     assert(results(0).get(0) == "http://www.archive.org/index.php")
     assert(results(0).get(1) == "archive.org")
@@ -145,7 +194,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
     assert(results(1).get(0) == "http://www.archive.org/index.php")
     assert(results(1).get(1) == "archive.org")
     assert(results(1).get(2) == "04")
-    assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+    assert(
+      results(1).get(
+        3
+      ) == "http://web.archive.org/collections/web/advanced.html"
+    )
 
     assert(results(2).get(0) == "http://www.archive.org/index.php")
     assert(results(2).get(1) == "archive.org")
@@ -154,7 +207,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Extract dates DD DF") {
-    val df = RecordLoader.loadArchives(arcPath, sc)
+    val df = RecordLoader
+      .loadArchives(arcPath, sc)
       .webpages()
 
     val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -166,14 +220,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
     import org.apache.spark.sql.functions._
     // scalastyle:on
 
-    val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
-      $"url".as("url"),
-      extractDate($"crawl_date",lit("DD")).as("crawl_date"),
-      explode_outer(extractLinks($"url", $"content")).as("link")
-      )
-      .filter(lower($"content").contains("keynote")) // filtered on keyword internet
-
-    val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3)
+    val interResults = df
+      .select(
+        removePrefixWWW(extractDomain($"url")).as("Domain"),
+        $"url".as("url"),
+        extractDate($"crawl_date", lit("DD")).as("crawl_date"),
+        explode_outer(extractLinks($"url", $"content")).as("link")
+      )
+      .filter(
+        lower($"content").contains("keynote")
+      ) // filtered on keyword internet
+
+    val results = interResults
+      .select(
+        $"url",
+        $"Domain",
+        $"crawl_date",
+        dest(array($"link")).as("destination_page")
+      )
+      .head(3)
 
     assert(results(0).get(0) == "http://www.archive.org/index.php")
     assert(results(0).get(1) == "archive.org")
@@ -183,7 +248,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
     assert(results(1).get(0) == "http://www.archive.org/index.php")
     assert(results(1).get(1) == "archive.org")
     assert(results(1).get(2) == "30")
-    assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+    assert(
+      results(1).get(
+        3
+      ) == "http://web.archive.org/collections/web/advanced.html"
+    )
 
     assert(results(2).get(0) == "http://www.archive.org/index.php")
     assert(results(2).get(1) == "archive.org")
@@ -192,7 +261,8 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Extract dates YYYYMMDD DF") {
-    val df = RecordLoader.loadArchives(arcPath, sc)
+    val df = RecordLoader
+      .loadArchives(arcPath, sc)
       .webpages()
 
     val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -204,14 +274,25 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
     import org.apache.spark.sql.functions._
     // scalastyle:on
 
-    val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
-      $"url".as("url"),
-      extractDate($"crawl_date",lit("YYYYMMDD")).as("crawl_date"),
-      explode_outer(extractLinks($"url", $"content")).as("link")
-      )
-      .filter(lower($"content").contains("keynote")) // filtered on keyword internet
-
-    val results = interResults.select($"url", $"Domain", $"crawl_date", dest(array($"link")).as("destination_page")).head(3)
+    val interResults = df
+      .select(
+        removePrefixWWW(extractDomain($"url")).as("Domain"),
+        $"url".as("url"),
+        extractDate($"crawl_date", lit("YYYYMMDD")).as("crawl_date"),
+        explode_outer(extractLinks($"url", $"content")).as("link")
+      )
+      .filter(
+        lower($"content").contains("keynote")
+      ) // filtered on keyword internet
+
+    val results = interResults
+      .select(
+        $"url",
+        $"Domain",
+        $"crawl_date",
+        dest(array($"link")).as("destination_page")
+      )
+      .head(3)
 
     assert(results(0).get(0) == "http://www.archive.org/index.php")
     assert(results(0).get(1) == "archive.org")
@@ -221,7 +302,11 @@ class ExtractDateDFTest extends FunSuite with BeforeAndAfter {
     assert(results(1).get(0) == "http://www.archive.org/index.php")
     assert(results(1).get(1) == "archive.org")
     assert(results(1).get(2) == "20080430")
-    assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+    assert(
+      results(1).get(
+        3
+      ) == "http://web.archive.org/collections/web/advanced.html"
+    )
 
     assert(results(2).get(0) == "http://www.archive.org/index.php")
     assert(results(2).get(1) == "archive.org")
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala
index 9bf45a60..f6b16aa1 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala
@@ -40,7 +40,8 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Extract links DF") {
-    val df = RecordLoader.loadArchives(arcPath, sc)
+    val df = RecordLoader
+      .loadArchives(arcPath, sc)
       .webpages()
 
     val dest = udf((vs: Seq[Any]) => vs(0).toString.split(",")(1))
@@ -51,14 +52,25 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {
     import spark.implicits._
     // scalastyle:on
 
-    val interResults = df.select(removePrefixWWW(extractDomain($"url")).as("Domain"),
-      $"url".as("url"),
-      $"crawl_date",
-      explode_outer(extractLinks($"url",$"content")).as("link")
-      )
-      .filter(lower($"content").contains("keynote")) // filtered on keyword internet
+    val interResults = df
+      .select(
+        removePrefixWWW(extractDomain($"url")).as("Domain"),
+        $"url".as("url"),
+        $"crawl_date",
+        explode_outer(extractLinks($"url", $"content")).as("link")
+      )
+      .filter(
+        lower($"content").contains("keynote")
+      ) // filtered on keyword internet
 
-    val results = interResults.select($"url",$"Domain",$"crawl_date",dest(array($"link")).as("destination_page")).head(3)
+    val results = interResults
+      .select(
+        $"url",
+        $"Domain",
+        $"crawl_date",
+        dest(array($"link")).as("destination_page")
+      )
+      .head(3)
 
     // Results should be:
     // +--------------------------------+-----------+----------+----------------------------------------------------+
@@ -69,7 +81,6 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {
     // |http://www.archive.org/index.php|archive.org|20080430  |http://www.sloan.org                                |
     // +--------------------------------+-----------+----------+----------------------------------------------------+
 
-
     assert(results(0).get(0) == "http://www.archive.org/index.php")
     assert(results(0).get(1) == "archive.org")
     assert(results(0).get(2) == "20080430")
@@ -78,7 +89,11 @@ class ExtractHyperlinksTest extends FunSuite with BeforeAndAfter {
     assert(results(1).get(0) == "http://www.archive.org/index.php")
     assert(results(1).get(1) == "archive.org")
     assert(results(1).get(2) == "20080430")
-    assert(results(1).get(3) == "http://web.archive.org/collections/web/advanced.html")
+    assert(
+      results(1).get(
+        3
+      ) == "http://web.archive.org/collections/web/advanced.html"
+    )
 
     assert(results(2).get(0) == "http://www.archive.org/index.php")
     assert(results(2).get(1) == "archive.org")
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala
index 843635ae..523ea78f 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala
@@ -39,14 +39,27 @@ class ExtractImageDetailsTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Image files extraction DF") {
-    val df = RecordLoader.loadArchives(arcPath, sc)
+    val df = RecordLoader
+      .loadArchives(arcPath, sc)
       .images()
-    val extracted = df.select("url", "mime_type_web_server", "mime_type_tika",
-      "width", "height", "md5", "sha1")
-      .orderBy(desc("md5")).head(2).toList
+    val extracted = df
+      .select(
+        "url",
+        "mime_type_web_server",
+        "mime_type_tika",
+        "width",
+        "height",
+        "md5",
+        "sha1"
+      )
+      .orderBy(desc("md5"))
+      .head(2)
+      .toList
 
     assert(extracted.size == 2)
-    assert("http://www.archive.org/images/mediatype_movies.gif" == extracted(0)(0))
+    assert(
+      "http://www.archive.org/images/mediatype_movies.gif" == extracted(0)(0)
+    )
     assert("image/gif" == extracted(0)(1))
     assert("image/gif" == extracted(0)(2))
     assert(21 == extracted(0)(3))
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala
index 4fa200b2..8e199ce2 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractImageLinksTest.scala
@@ -39,7 +39,8 @@ class ImageLinksTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Image links extraction DF") {
-    val df = RecordLoader.loadArchives(arcPath, sc)
+    val df = RecordLoader
+      .loadArchives(arcPath, sc)
       .imagegraph()
 
     // We need this in order to use the $-notation
@@ -47,13 +48,24 @@ class ImageLinksTest extends FunSuite with BeforeAndAfter {
     // scalastyle:off
     import spark.implicits._
     // scalastyle:on
-    val extracted = df.select($"src".as("Domain"), $"image_url".as("Image"))
-      .orderBy(desc("Image")).head(2).toList
+    val extracted = df
+      .select($"src".as("Domain"), $"image_url".as("Image"))
+      .orderBy(desc("Image"))
+      .head(2)
+      .toList
     assert(extracted.size == 2)
     assert("http://www.archive.org/index.php" == extracted(0)(0))
-    assert("http://www.archive.org/services/get-item-image.php?identifier=zh27814&collection=zh27&mediatype=audio" == extracted(0)(1))
+    assert(
+      "http://www.archive.org/services/get-item-image.php?identifier=zh27814&collection=zh27&mediatype=audio" == extracted(
+        0
+      )(1)
+    )
     assert("http://www.archive.org/index.php" == extracted(1)(0))
-    assert("http://www.archive.org/services/get-item-image.php?identifier=secretarmiesb00spivrich&collection=americana&mediatype=texts" == extracted(1)(1))
+    assert(
+      "http://www.archive.org/services/get-item-image.php?identifier=secretarmiesb00spivrich&collection=americana&mediatype=texts" == extracted(
+        1
+      )(1)
+    )
   }
 
   after {
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala
index b17fb32a..5ffdf6d6 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
 
 @RunWith(classOf[JUnitRunner])
 class ExtractPDFDetailsTest extends FunSuite with BeforeAndAfter {
-  private val warcPath = Resources.getResource("warc/example.pdf.warc.gz").getPath
+  private val warcPath =
+    Resources.getResource("warc/example.pdf.warc.gz").getPath
   private val master = "local[4]"
   private val appName = "example-df"
   private var sc: SparkContext = _
@@ -39,21 +40,41 @@ class ExtractPDFDetailsTest extends FunSuite with BeforeAndAfter {
   }
 
   test("PDF files extraction DF") {
-    val df = RecordLoader.loadArchives(warcPath, sc)
+    val df = RecordLoader
+      .loadArchives(warcPath, sc)
       .pdfs()
-    val extracted = df.select("url", "filename", "extension",
-      "mime_type_web_server", "mime_type_tika", "md5")
-      .orderBy(desc("md5")).head(2).toList
+    val extracted = df
+      .select(
+        "url",
+        "filename",
+        "extension",
+        "mime_type_web_server",
+        "mime_type_tika",
+        "md5"
+      )
+      .orderBy(desc("md5"))
+      .head(2)
+      .toList
 
     assert(extracted.size == 2)
-    assert("https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" == extracted(0)(0))
+    assert(
+      "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" == extracted(
+        0
+      )(0)
+    )
     assert("cost-analysis.pdf" == extracted(0)(1))
     assert("pdf" == extracted(0)(2))
     assert("application/pdf" == extracted(0)(3))
     assert("application/pdf" == extracted(0)(4))
     assert("aaba59d2287afd40c996488a39bbc0dd" == extracted(0)(5))
-    assert("https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf?sequence=3&isAllowed=y" == extracted(1)(0))
-    assert("JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf" == extracted(1)(1))
+    assert(
+      "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf?sequence=3&isAllowed=y" == extracted(
+        1
+      )(0)
+    )
+    assert(
+      "JCDL%20-%20Cost%20of%20a%20WARC%20Presentation-4.pdf" == extracted(1)(1)
+    )
     assert("pdf" == extracted(1)(2))
     assert("application/pdf" == extracted(1)(3))
     assert("application/pdf" == extracted(1)(4))
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala
index 4168e5a6..c09d432f 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
 
 @RunWith(classOf[JUnitRunner])
 class PresentationProgramFilesTest extends FunSuite with BeforeAndAfter {
-  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+  private val warcPath =
+    Resources.getResource("warc/example.docs.warc.gz").getPath
   private val master = "local[4]"
   private val appName = "example-df"
   private var sc: SparkContext = _
@@ -39,24 +40,50 @@ class PresentationProgramFilesTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Presentation program files extraction DF") {
-    val df = RecordLoader.loadArchives(warcPath, sc)
+    val df = RecordLoader
+      .loadArchives(warcPath, sc)
       .presentationProgramFiles()
-    val extracted = df.select("url", "filename", "extension",
-      "mime_type_web_server", "mime_type_tika", "md5")
-      .orderBy(desc("md5")).head(2).toList
+    val extracted = df
+      .select(
+        "url",
+        "filename",
+        "extension",
+        "mime_type_web_server",
+        "mime_type_tika",
+        "md5"
+      )
+      .orderBy(desc("md5"))
+      .head(2)
+      .toList
 
     assert(extracted.size == 2)
-    assert("https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.odp" == extracted(0)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.odp" == extracted(
+        0
+      )(0)
+    )
     assert("aut-test-fixtures.odp" == extracted(0)(1))
     assert("odp" == extracted(0)(2))
     assert("application/vnd.oasis.opendocument.presentation" == extracted(0)(3))
     assert("application/vnd.oasis.opendocument.presentation" == extracted(0)(4))
     assert("f38b2679029cf3453c8151b92c615c70" == extracted(0)(5))
-    assert("https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" == extracted(1)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" == extracted(
+        1
+      )(0)
+    )
     assert("aut-test-fixtures.pptx" == extracted(1)(1))
     assert("pptx" == extracted(1)(2))
-    assert("application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(1)(3))
-    assert("application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(1)(4))
+    assert(
+      "application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(
+        1
+      )(3)
+    )
+    assert(
+      "application/vnd.openxmlformats-officedocument.presentationml.presentation" == extracted(
+        1
+      )(4)
+    )
     assert("7a7b1fe4b6d311376eaced9de3b682ee" == extracted(1)(5))
   }
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala
index c326b021..a36c3742 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
 
 @RunWith(classOf[JUnitRunner])
 class ExtractSpreadsheetDetailsTest extends FunSuite with BeforeAndAfter {
-  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+  private val warcPath =
+    Resources.getResource("warc/example.docs.warc.gz").getPath
   private val master = "local[4]"
   private val appName = "example-df"
   private var sc: SparkContext = _
@@ -39,32 +40,66 @@ class ExtractSpreadsheetDetailsTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Spreadsheet files extraction DF") {
-    val df = RecordLoader.loadArchives(warcPath, sc)
+    val df = RecordLoader
+      .loadArchives(warcPath, sc)
       .spreadsheets()
-    val extracted = df.select("url", "filename", "extension",
-      "mime_type_web_server", "mime_type_tika", "md5")
-      .orderBy(desc("md5")).head(4).toList
+    val extracted = df
+      .select(
+        "url",
+        "filename",
+        "extension",
+        "mime_type_web_server",
+        "mime_type_tika",
+        "md5"
+      )
+      .orderBy(desc("md5"))
+      .head(4)
+      .toList
 
     assert(extracted.size == 4)
-    assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.xlsx" == extracted(0)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.xlsx" == extracted(
+        0
+      )(0)
+    )
     assert("test-aut-fixture.xlsx" == extracted(0)(1))
     assert("xlsx" == extracted(0)(2))
-    assert("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" == extracted(0)(3))
-    assert("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" == extracted(0)(4))
+    assert(
+      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" == extracted(
+        0
+      )(3)
+    )
+    assert(
+      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" == extracted(
+        0
+      )(4)
+    )
     assert("befb3304cb592e0761509bf626171071" == extracted(0)(5))
-    assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.tsv" == extracted(1)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.tsv" == extracted(
+        1
+      )(0)
+    )
     assert("test-aut-fixture%20-%20Sheet1.tsv" == extracted(1)(1))
     assert("tsv" == extracted(1)(2))
     assert("text/tab-separated-values" == extracted(1)(3))
     assert("text/plain" == extracted(1)(4))
     assert("8ce6e9489c1c1129cca0e3f1eb8206ce" == extracted(1)(5))
-    assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods" == extracted(2)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods" == extracted(
+        2
+      )(0)
+    )
     assert("test-aut-fixture.ods" == extracted(2)(1))
     assert("ods" == extracted(2)(2))
     assert("application/vnd.oasis.opendocument.spreadsheet" == extracted(2)(3))
     assert("application/vnd.oasis.opendocument.spreadsheet" == extracted(2)(4))
     assert("7f70280757d8beb2d1bfd6fb1b6ae6e9" == extracted(2)(5))
-    assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.csv" == extracted(3)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture%20-%20Sheet1.csv" == extracted(
+        3
+      )(0)
+    )
    assert("test-aut-fixture%20-%20Sheet1.csv" == extracted(3)(1))
     assert("csv" == extracted(3)(2))
     assert("text/csv" == extracted(3)(3))
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala
index be1048b0..b0bf7abd 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
 
 @RunWith(classOf[JUnitRunner])
 class VideoTest extends FunSuite with BeforeAndAfter {
-  private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
+  private val warcPath =
+    Resources.getResource("warc/example.media.warc.gz").getPath
   private val master = "local[4]"
   private val appName = "example-df"
   private var sc: SparkContext = _
@@ -39,12 +40,22 @@ class VideoTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Video files extraction DF") {
-    val df = RecordLoader.loadArchives(warcPath, sc)
+    val df = RecordLoader
+      .loadArchives(warcPath, sc)
       .videos()
-    val extracted = df.select("url", "filename", "extension",
-      "mime_type_web_server", "mime_type_tika", "md5")
-      .orderBy(desc("md5")).head(1).toList
+    val extracted = df
+      .select(
+        "url",
+        "filename",
+        "extension",
+        "mime_type_web_server",
+        "mime_type_tika",
+        "md5"
+      )
+      .orderBy(desc("md5"))
+      .head(1)
+      .toList
 
     assert(extracted.size == 1)
     assert("https://ruebot.net/2018-11-12%2016.14.11.mp4" == extracted(0)(0))
     assert("2018-11-12%2016.14.11.mp4" == extracted(0)(1))
diff --git a/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala
index a9d474de..bc3e9100 100644
--- a/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala
@@ -26,7 +26,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
 
 @RunWith(classOf[JUnitRunner])
 class WordProcessorFilesTest extends FunSuite with BeforeAndAfter {
-  private val warcPath = Resources.getResource("warc/example.docs.warc.gz").getPath
+  private val warcPath =
+    Resources.getResource("warc/example.docs.warc.gz").getPath
   private val master = "local[4]"
   private val appName = "example-df"
   private var sc: SparkContext = _
@@ -39,30 +40,60 @@ class WordProcessorFilesTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Word processor files extraction DF") {
-    val df = RecordLoader.loadArchives(warcPath, sc)
+    val df = RecordLoader
+      .loadArchives(warcPath, sc)
       .wordProcessorFiles()
-    val extracted = df.select("url", "filename", "extension",
-      "mime_type_web_server", "mime_type_tika", "md5")
-      .orderBy(desc("md5")).head(3).toList
+    val extracted = df
+      .select(
+        "url",
+        "filename",
+        "extension",
+        "mime_type_web_server",
+        "mime_type_tika",
+        "md5"
+      )
+      .orderBy(desc("md5"))
+      .head(3)
+      .toList
 
     assert(extracted.size == 3)
-    assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf" == extracted(0)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf" == extracted(
+        0
+      )(0)
+    )
     assert("test-aut-fixtures.rtf" == extracted(0)(1))
     assert("rtf" == extracted(0)(2))
     assert("application/rtf" == extracted(0)(3))
     assert("application/rtf" == extracted(0)(4))
     assert("e483512b65ba44d71e843c57de2adeb7" == extracted(0)(5))
-    assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.odt" == extracted(1)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.odt" == extracted(
+        1
+      )(0)
+    )
     assert("test-aut-fixtures.odt" == extracted(1)(1))
     assert("odt" == extracted(1)(2))
     assert("application/vnd.oasis.opendocument.text" == extracted(1)(3))
     assert("application/vnd.oasis.opendocument.text" == extracted(1)(4))
     assert("9ef1aaee5c18cd16c47e75aaa38bd393" == extracted(1)(5))
-    assert("https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.docx" == extracted(2)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.docx" == extracted(
+        2
+      )(0)
+    )
     assert("test-aut-fixtures.docx" == extracted(2)(1))
     assert("docx" == extracted(2)(2))
-    assert("application/vnd.openxmlformats-officedocument.wordprocessingml.document" == extracted(2)(3))
-    assert("application/vnd.openxmlformats-officedocument.wordprocessingml.document" == extracted(2)(4))
+    assert(
+      "application/vnd.openxmlformats-officedocument.wordprocessingml.document" == extracted(
+        2
+      )(3)
+    )
+    assert(
+      "application/vnd.openxmlformats-officedocument.wordprocessingml.document" == extracted(
+        2
+      )(4)
+    )
     assert("51040165e60629c6bf63c2bd40b9e628" == extracted(2)(5))
   }
diff --git a/src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala b/src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala
index 6ea5203c..f67332ac 100644
--- a/src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/SaveMediaBytesTest.scala
@@ -30,12 +30,18 @@ import java.io.File
 import java.nio.file.{Paths, Files}
 import java.util.Base64
 
-case class TestMediaDetails(url: String, extension: String, mime_type: String,
-  md5: String, bytes: String)
+case class TestMediaDetails(
+    url: String,
+    extension: String,
+    mime_type: String,
+    md5: String,
+    bytes: String
+)
 
 @RunWith(classOf[JUnitRunner])
 class SaveMediaBytesTest extends FunSuite with BeforeAndAfter {
-  private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
+  private val warcPath =
+    Resources.getResource("warc/example.media.warc.gz").getPath
   private val master = "local[4]"
   private val appName = "example-df"
   private var sc: SparkContext = _
@@ -50,11 +56,14 @@ class SaveMediaBytesTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Save audio bytes to disk DF") {
-    val df = RecordLoader.loadArchives(warcPath, sc)
+    val df = RecordLoader
+      .loadArchives(warcPath, sc)
       .audio()
-    val extracted = df.select(testString, testExtension)
-      .orderBy(desc(testString)).limit(1)
+    val extracted = df
+      .select(testString, testExtension)
+      .orderBy(desc(testString))
+      .limit(1)
     extracted.saveToDisk(testString, "/tmp/audio", testExtension)
 
     val encodedBytes: String = extracted.take(1)(0).getAs(testString)
@@ -68,11 +77,19 @@ class SaveMediaBytesTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Attempt to save invalid audio DF") {
-    val dummyEncBytes = Base64.getEncoder.encodeToString(Array.range(0, 127)
-      .map(_.toByte))
+    val dummyEncBytes = Base64.getEncoder.encodeToString(
+      Array
+        .range(0, 127)
+        .map(_.toByte)
+    )
     val dummyMD5 = ComputeMD5(dummyEncBytes.getBytes)
-    val dummyAudio = TestMediaDetails("http://example.com/fake.mp3", "mp3",
-      "audio/mpeg", dummyMD5, dummyEncBytes)
+    val dummyAudio = TestMediaDetails(
+      "http://example.com/fake.mp3",
+      "mp3",
+      "audio/mpeg",
+      dummyMD5,
+      dummyEncBytes
+    )
 
     // For toDF().
     val spark = SparkSession.builder().master("local").getOrCreate()
@@ -84,8 +101,12 @@ class SaveMediaBytesTest extends FunSuite with BeforeAndAfter {
     df.saveToDisk(testString, "/tmp/bar", "extension")
 
     // Check that no file was written.
-    assert(new File("/tmp").listFiles.filter(_.isFile).toList
-      .count(_.getName.startsWith("bar-" + dummyMD5)) == 0)
+    assert(
+      new File("/tmp").listFiles
+        .filter(_.isFile)
+        .toList
+        .count(_.getName.startsWith("bar-" + dummyMD5)) == 0
+    )
   }
 
   after {
diff --git a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
index d9309f7a..b39da1d9 100644
--- a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
@@ -40,7 +40,8 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter {
   }
 
   test("Count records DF") {
-    val df = RecordLoader.loadArchives(arcPath, sc)
+    val df = RecordLoader
+      .loadArchives(arcPath, sc)
       .webpages()
 
     // We need this in order to use the $-notation
@@ -49,8 +50,12 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter {
     import spark.implicits._
     // scalastyle:on
 
-    val results = df.select(extractDomain($"Url").as("Domain"))
-      .groupBy("Domain").count().orderBy(desc("count")).head(3)
+    val results = df
+      .select(extractDomain($"Url").as("Domain"))
+      .groupBy("Domain")
+      .count()
+      .orderBy(desc("count"))
+      .head(3)
 
     // Results should be:
     // +------------------+-----+
diff --git a/src/test/scala/io/archivesunleashed/df/UdfsTests.scala b/src/test/scala/io/archivesunleashed/df/UdfsTests.scala
index a5044984..8a9c65ce 100644
--- a/src/test/scala/io/archivesunleashed/df/UdfsTests.scala
+++ b/src/test/scala/io/archivesunleashed/df/UdfsTests.scala
@@ -17,7 +17,13 @@ package io.archivesunleashed
 
 import com.google.common.io.Resources
-import io.archivesunleashed.udfs.{computeImageSize, computeMD5, computeSHA1, extractImageLinks, getExtensionMime}
+import io.archivesunleashed.udfs.{
+  computeImageSize,
+  computeMD5,
+  computeSHA1,
+  extractImageLinks,
+  getExtensionMime
+}
 import org.apache.spark.sql.functions.{desc, explode, unbase64}
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.{SparkConf, SparkContext}
@@ -39,8 +45,11 @@ class UdfsTest extends FunSuite with BeforeAndAfter {
     sc = new SparkContext(conf)
   }
 
-  test("DF Udf tests; computeSHA1, computeMD5, extractImageLinks, getExtensionMime") {
-    val df = RecordLoader.loadArchives(arcPath, sc)
+  test(
+    "DF Udf tests; computeSHA1, computeMD5, extractImageLinks, getExtensionMime"
+  ) {
+    val df = RecordLoader
+      .loadArchives(arcPath, sc)
       .webpages()
 
     // We need this in order to use the $-notation
@@ -49,33 +58,57 @@ class UdfsTest extends FunSuite with BeforeAndAfter {
     import spark.implicits._
     // scalastyle:on
 
-    val extracted = df.select($"url", $"mime_type_web_server", $"mime_type_tika",
-      computeSHA1($"content").as("sha1_test"),
-      computeMD5($"content").as("md5_test"),
-      explode(extractImageLinks($"url", $"content")).as("image_link"),
-      getExtensionMime($"url", $"mime_type_tika").as("extension"))
-      .orderBy(desc("md5_test")).head(4).toList
+    val extracted = df
+      .select(
+        $"url",
+        $"mime_type_web_server",
+        $"mime_type_tika",
+        computeSHA1($"content").as("sha1_test"),
+        computeMD5($"content").as("md5_test"),
+        explode(extractImageLinks($"url", $"content")).as("image_link"),
+        getExtensionMime($"url", $"mime_type_tika").as("extension")
+      )
+      .orderBy(desc("md5_test"))
+      .head(4)
+      .toList
 
     assert(extracted.size == 4)
-    assert(extracted(0).get(0) == "http://www.archive.org/iathreads/post-view.php?id=186011")
+    assert(
+      extracted(0).get(
+        0
+      ) == "http://www.archive.org/iathreads/post-view.php?id=186011"
+    )
     assert(extracted(0).get(1) == "text/html")
     assert(extracted(0).get(2) == "text/html")
     assert(extracted(0).get(3) == "9b9cd08e300f49ae59b1f2ced1bcd43fa8b5418c")
     assert(extracted(0).get(4) == "ff14be99e72943e85fe2368c1e65127a")
-    assert(extracted(0).get(5).toString == "[http://www.archive.org/iathreads/post-view.php?id=186011,http://www.archive.org/images/logo.jpg,(logo)]")
+    assert(
+      extracted(0)
+        .get(5)
+        .toString == "[http://www.archive.org/iathreads/post-view.php?id=186011,http://www.archive.org/images/logo.jpg,(logo)]"
+    )
     assert(extracted(0).get(6) == "html")
-    assert(extracted(3).get(0) == "http://www.archive.org/iathreads/forum-display.php?poster=RipJarvis")
+    assert(
+      extracted(3).get(
+        0
+      ) == "http://www.archive.org/iathreads/forum-display.php?poster=RipJarvis"
+    )
     assert(extracted(3).get(1) == "text/html")
     assert(extracted(3).get(2) == "text/html")
     assert(extracted(3).get(3) == "284a847892deaeb7790fe1b4123a9ccb47a246ed")
     assert(extracted(3).get(4) == "fe0c87b4db0ae846924c56f389083f39")
-    assert(extracted(3).get(5).toString == "[http://www.archive.org/iathreads/forum-display.php?poster=RipJarvis,http://www.archive.org/images/logo.jpg,(logo)]")
+    assert(
+      extracted(3)
+        .get(5)
+        .toString == "[http://www.archive.org/iathreads/forum-display.php?poster=RipJarvis,http://www.archive.org/images/logo.jpg,(logo)]"
+    )
     assert(extracted(3).get(6) == "html")
   }
 
   test("DF Udf tests; computeImageSize, computeSHA1, computeMD5") {
-    val df = RecordLoader.loadArchives(arcPath, sc)
+    val df = RecordLoader
+      .loadArchives(arcPath, sc)
       .images()
 
     // We need this in order to use the $-notation
@@ -84,13 +117,21 @@ class UdfsTest extends FunSuite with BeforeAndAfter {
     import spark.implicits._
     // scalastyle:on
 
-    val extracted = df.select($"md5", $"sha1", $"height", $"width",
-      computeImageSize(unbase64($"bytes")).as("image_size"),
-      computeSHA1(unbase64($"bytes")).as("sha1_test"),
-      computeMD5(unbase64($"bytes")).as("md5_test"))
-      .withColumn("img_width", $"image_size._1")
-      .withColumn("img_height", $"image_size._2")
-      .orderBy(desc("md5")).head(2).toList
+    val extracted = df
+      .select(
+        $"md5",
+        $"sha1",
+        $"height",
+        $"width",
+        computeImageSize(unbase64($"bytes")).as("image_size"),
+        computeSHA1(unbase64($"bytes")).as("sha1_test"),
+        computeMD5(unbase64($"bytes")).as("md5_test")
+      )
+      .withColumn("img_width", $"image_size._1")
+      .withColumn("img_height", $"image_size._2")
+      .orderBy(desc("md5"))
+      .head(2)
+      .toList
 
     assert(extracted.size == 2)
     assert(extracted(0).get(0) == "ff05f9b408519079c992202e8c8a14ee")
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala
index 3b7981ca..ef1d5a9f 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ComputeImageSizeTest.scala
@@ -30,17 +30,18 @@ import org.scalatest.junit.JUnitRunner
 class ComputeImageSizeTest extends FunSuite {
   val testImageSize = 10
   var ios: ByteArrayOutputStream = new ByteArrayOutputStream();
-  val img = new BufferedImage(testImageSize, testImageSize, BufferedImage.TYPE_INT_RGB)
+  val img =
+    new BufferedImage(testImageSize, testImageSize, BufferedImage.TYPE_INT_RGB)
   ImageIO.write(img, "png", ios)
   ios.flush()
   var image: Array[Byte] = ios.toByteArray();
   ios.close()
 
-  test ("Check images and provide size RDD") {
+  test("Check images and provide size RDD") {
     val imageSize = (10, 10)
     val emptyImageSize = (0, 0)
 
     assert(ComputeImageSize(image) == imageSize)
-    assert(ComputeImageSize(Array[Byte](0,0,0)) == emptyImageSize)
+    assert(ComputeImageSize(Array[Byte](0, 0, 0)) == emptyImageSize)
     // scalastyle:off null
     assert(ComputeImageSize(null) == emptyImageSize)
     // scalastyle:on null
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala
index 1f9a90bc..c036df3a 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala
@@ -25,8 +25,8 @@ import org.scalatest.junit.JUnitRunner
 @RunWith(classOf[JUnitRunner])
 class ExtractBoilerPipeTextTest extends FunSuite {
   val header = "HTTP/1.0 200 OK Content-Type: text/html;" +
-    "charset=UTF-8 Expires: Fri, 20 Jul 2018 19:09:28 GMT Date:" +
-    "Fri, 20 Jul 2018 19:09:28 GMT Cache-Control: private,;\r\n\r\n"
+      "charset=UTF-8 Expires: Fri, 20 Jul 2018 19:09:28 GMT Date:" +
+      "Fri, 20 Jul 2018 19:09:28 GMT Cache-Control: private,;\r\n\r\n"
 
   var text = """

Text with a boiler plate.

Copyright 2017
""" var boiler = """Copyright 2017""" diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala index f2cffc4d..24c8f375 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDateTest.scala @@ -16,7 +16,13 @@ package io.archivesunleashed.matchbox -import io.archivesunleashed.matchbox.ExtractDate.DateComponent.{DD, MM, YYYY, YYYYMM, YYYYMMDD} +import io.archivesunleashed.matchbox.ExtractDate.DateComponent.{ + DD, + MM, + YYYY, + YYYYMM, + YYYYMMDD +} import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala index 077a69e9..fb0af680 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala @@ -28,19 +28,34 @@ class ExtractDomainTest extends FunSuite { private val lintool = "https://github.com/lintool" private val github = "github.com" - private val data1: Seq[(String, String)] = Seq.newBuilder.+=( - (jimmylin, umiacs), - (lintool, github), - ("http://ianmilligan.ca/2015/05/04/iipc-2015-slides-for-warcs-wats-and-wgets-presentation/", "ianmilligan.ca"), - (index, "")).result() + private val data1: Seq[(String, String)] = Seq.newBuilder + .+=( + (jimmylin, umiacs), + (lintool, github), + ( + "http://ianmilligan.ca/2015/05/04/iipc-2015-slides-for-warcs-wats-and-wgets-presentation/", + "ianmilligan.ca" + ), + (index, "") + ) + .result() - private val data2 = Seq.newBuilder.+=( - (index, jimmylin, umiacs), - (lintool, jimmylin, github), - (index, lintool, github)).result() + private val data2 = Seq.newBuilder + .+=( + (index, jimmylin, umiacs), + (lintool, jimmylin, github), + (index, lintool, github) + ) + .result() - private val data3 = Seq.newBuilder.+=( - ("http://www.seetorontonow.canada-booknow.com\\booking_results.php", "www.seetorontonow.canada-booknow.com")).result() + private val data3 = Seq.newBuilder + .+=( + ( + "http://www.seetorontonow.canada-booknow.com\\booking_results.php", + "www.seetorontonow.canada-booknow.com" + ) + ) + .result() test("Extract simple domain extraction RDD") { data1.foreach { diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala index 6c3ec63d..938b062c 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractImageLinksTest.scala @@ -27,7 +27,8 @@ class ExtractImageLinksTest extends FunSuite { test("Extract simple image links RDD") { val fragment: String = """Image here: picture and another baz banner""" - val extracted: Seq[(String, String, String)] = ExtractImageLinks("", fragment) + val extracted: Seq[(String, String, String)] = + ExtractImageLinks("", fragment) assert(extracted.size == 2) assert("http://foo.bar.com/pic.png" == extracted(0)._2) assert("picture" == extracted(0)._3) @@ -38,7 +39,8 @@ class ExtractImageLinksTest extends FunSuite { test("Extract relative image links RDD") { val fragment: String = """Image here: picture and another baz banner and LOGO""" - val extracted: Seq[(String, String, String)] = ExtractImageLinks("http://foo.bar.com/a/page.html", fragment) + val extracted: Seq[(String, 
+      ExtractImageLinks("http://foo.bar.com/a/page.html", fragment)
     assert(extracted.size == 3)
     assert("http://foo.bar.com/a/pic.png" == extracted(0)._2)
     assert("picture" == extracted(0)._3)
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala
index b2b8e339..e69321ba 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala
@@ -28,8 +28,9 @@ import scala.collection.mutable
 
 @RunWith(classOf[JUnitRunner])
 class ExtractLinksTest extends FunSuite {
-  val fragment: String = "Here is <a href=\"http://www.google.com\">a search engine</a>.\n" +
-    "Here is <a href=\"http://www.twitter.com/\">Twitter</a>.\n"
+  val fragment: String =
+    "Here is <a href=\"http://www.google.com\">a search engine</a>.\n" +
+      "Here is <a href=\"http://www.twitter.com/\">Twitter</a>.\n"
   val fooFragment: String = "http://www.foobar.org/index.html"
   val url = "http://www.google.com"
   val twitter = "http://www.twitter.com/"
@@ -46,9 +47,10 @@ class ExtractLinksTest extends FunSuite {
 
   test("Extract relative links RDD") {
     val fragmentLocal: String = "Here is " +
-      "<a href=\"http://www.google.com\">a search engine</a>.\nHere is a <a href=\"page.html\">a relative URL</a>.\n"
+        "<a href=\"http://www.google.com\">a search engine</a>.\nHere is a <a href=\"page.html\">a relative URL</a>.\n"
     val fooFragmentLocal = "http://www.foobar.org/page.html"
-    val extracted: Seq[(String, String, String)] = ExtractLinks("", fragmentLocal, fooFragment)
+    val extracted: Seq[(String, String, String)] =
+      ExtractLinks("", fragmentLocal, fooFragment)
     assert(extracted.size == 2)
     assert(url == extracted.head._2)
     assert(head == extracted.head._3)
@@ -58,10 +60,17 @@ class ExtractLinksTest extends FunSuite {
 
   test("Test link errors RDD") {
     val bytes: Array[Byte] = "wronglyTyped".getBytes()
-    val invalid: String = "Here is a fake url bogus search engine."
+    val invalid: String =
+      "Here is a fake url bogus search engine."
 
     // scalastyle:off null
-    assert(ExtractLinks(null, fragment, fooFragment) == mutable.MutableList[(String, String, String)]())
+    assert(
+      ExtractLinks(null, fragment, fooFragment) == mutable
+        .MutableList[(String, String, String)]()
+    )
     // scalastyle:on null
-    assert(ExtractLinks("", "", fooFragment) == mutable.MutableList[(String, String, String)]())
+    assert(
+      ExtractLinks("", "", fooFragment) == mutable
+        .MutableList[(String, String, String)]()
+    )
   }
 }
diff --git a/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala b/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala
index 55fa8520..66f64454 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala
@@ -27,7 +27,8 @@ import org.scalatest.{BeforeAndAfter, FunSuite}
 
 @RunWith(classOf[JUnitRunner])
 class GetExtensionMIMETest extends FunSuite with BeforeAndAfter {
-  private val warcPath = Resources.getResource("warc/example.media.warc.gz").getPath
+  private val warcPath =
+    Resources.getResource("warc/example.media.warc.gz").getPath
   private val master = "local[4]"
   private val appName = "example-df"
   private var sc: SparkContext = _
@@ -42,14 +43,28 @@ class GetExtensionMIMETest extends FunSuite with BeforeAndAfter {
   }
 
   test("Get extension of file from URL with no extension") {
-    df = RecordLoader.loadArchives(warcPath, sc)
+    df = RecordLoader
+      .loadArchives(warcPath, sc)
       .images()
-    extracted = df.select("url", "filename", "extension",
-      "mime_type_web_server", "mime_type_tika", "md5")
-      .orderBy(desc("md5")).head(3).toList
+    extracted = df
+      .select(
+        "url",
+        "filename",
+        "extension",
+        "mime_type_web_server",
+        "mime_type_tika",
+        "md5"
+      )
+      .orderBy(desc("md5"))
+      .head(3)
+      .toList
 
     assert(extracted.size == 3)
-    assert("https://ruebot.net/files/aut-test-fixtures/this_is_a_gif" == extracted(0)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/this_is_a_gif" == extracted(
+        0
+      )(0)
+    )
     assert("this_is_a_gif" == extracted(0)(1))
     assert("gif" == extracted(0)(2))
     assert("unknown" == extracted(0)(3))
@@ -58,7 +73,11 @@ class GetExtensionMIMETest extends FunSuite with BeforeAndAfter {
   }
 
   test("Get extension of file from URL with correct extension") {
-    assert("https://ruebot.net/files/aut-test-fixtures/real_png.png" == extracted(1)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/real_png.png" == extracted(1)(
+        0
+      )
+    )
     assert("real_png.png" == extracted(1)(1))
     assert("png" == extracted(1)(2))
     assert("image/png" == extracted(1)(3))
@@ -67,7 +86,11 @@ class GetExtensionMIMETest extends FunSuite with BeforeAndAfter {
   }
 
   test("Get extension of file from URL with incorrect extension") {
-    assert("https://ruebot.net/files/aut-test-fixtures/this_is_a_jpeg.mp3" == extracted(2)(0))
+    assert(
+      "https://ruebot.net/files/aut-test-fixtures/this_is_a_jpeg.mp3" == extracted(
+        2
+      )(0)
+    )
     assert("this_is_a_jpeg.mp3" == extracted(2)(1))
     assert("jpg" == extracted(2)(2))
     assert("audio/mpeg" == extracted(2)(3))
diff --git a/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala b/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala
index 6d2d81d0..7635a867 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/StringUtilsTest.scala
@@ -39,17 +39,19 @@ class StringUtilsTest extends FunSuite {
     val except: String = null;
     // scalastyle:on null
     assert(invalid.escapeInvalidXML() == "A&lt;B&gt;C&amp;D&quot;");
-    val caught = intercept[IOException] {except.escapeInvalidXML()}
-    assert (caught.getMessage == "Caught exception processing input row ");
+    val caught = intercept[IOException] { except.escapeInvalidXML() }
+    assert(caught.getMessage == "Caught exception processing input row ");
   }
 
-  test ("MD5 hash") {
+  test("MD5 hash") {
     val s: String = "unesco.org";
     assert(ComputeMD5(s.getBytes) == "8e8decc8e8107bcf9d3896f3222b77d8");
   }
 
-  test ("SHA1 hash") {
+  test("SHA1 hash") {
     val s: String = "unesco.org";
-    assert(ComputeSHA1(s.getBytes) == "2d0e5377157172045d87befe46e157cda42c4f6e");
+    assert(
+      ComputeSHA1(s.getBytes) == "2d0e5377157172045d87befe46e157cda42c4f6e"
+    );
   }
 }