Address main scalastyle errors - #196 #248

Merged (6 commits, Aug 1, 2018)
37 changes: 22 additions & 15 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
@@ -62,66 +62,73 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends
   var arcRecord: ARCRecord = null
   var warcRecord: WARCRecord = null

-  if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+  if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
     arcRecord = r.t.getRecord.asInstanceOf[ARCRecord]
-  else if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.WARC)
-    warcRecord = r.t.getRecord.asInstanceOf[WARCRecord]
-
+  } else if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.WARC) {
+    warcRecord = r.t.getRecord.asInstanceOf[WARCRecord]
+  }
   val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX")

   val getCrawlDate: String = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC){
       ExtractDate(arcRecord.getMetaData.getDate, ExtractDate.DateComponent.YYYYMMDD)
-    else
+    } else {
       ExtractDate(
         ArchiveUtils.get14DigitDate(
           ISO8601.parse(warcRecord.getHeader.getDate)), ExtractDate.DateComponent.YYYYMMDD)
+    }
   }

   val getCrawlMonth: String = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
       ExtractDate(arcRecord.getMetaData.getDate, ExtractDate.DateComponent.YYYYMM)
-    else
+    } else {
       ExtractDate(
         ArchiveUtils.get14DigitDate(
           ISO8601.parse(warcRecord.getHeader.getDate)), ExtractDate.DateComponent.YYYYMM)
+    }
   }

   val getContentBytes: Array[Byte] = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
       ArcRecordUtils.getBodyContent(arcRecord)
-    else
+    } else {
       WarcRecordUtils.getContent(warcRecord)
+    }
   }

   val getContentString: String = {
     new String(getContentBytes)
   }

   val getMimeType: String = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
       arcRecord.getMetaData.getMimetype
-    else
+    } else {
       WarcRecordUtils.getWarcResponseMimeType(getContentBytes)
+    }
   }

   val getUrl: String = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
       arcRecord.getMetaData.getUrl
-    else
+    } else {
       warcRecord.getHeader.getUrl
+    }
   }

   val getDomain: String = {
     ExtractDomain(getUrl)
   }

   val getImageBytes: Array[Byte] = {
-    if (getContentString.startsWith("HTTP/"))
+    if (getContentString.startsWith("HTTP/")) {
       getContentBytes.slice(
         getContentString.indexOf(RemoveHttpHeader.headerEnd)
           + RemoveHttpHeader.headerEnd.length, getContentBytes.length)
-    else
+    } else {
       getContentBytes
+    }
   }
 }
25 changes: 22 additions & 3 deletions src/main/scala/io/archivesunleashed/DataFrameLoader.scala
@@ -1,7 +1,26 @@
-package io.archivesunleashed
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.archivesunleashed

 import org.apache.spark.SparkContext
+// scalastyle:off underscore.import
 import org.apache.spark.sql._
+// scalastyle:on underscore.import

 class DataFrameLoader(sc: SparkContext) {
   def extractValidPages(path: String): DataFrame = {
@@ -16,8 +35,8 @@ class DataFrameLoader(sc: SparkContext) {

   /* Create a dataframe with (source page, image url) pairs */
   def extractImageLinks(path: String): DataFrame = {
-    RecordLoader.loadArchives(path, sc)
-      .extractImageLinksDF()
+    RecordLoader.loadArchives(path, sc)
+      .extractImageLinksDF()
   }

   /** Create a dataframe with (image url, type, width, height, md5, raw bytes) pairs */
19 changes: 11 additions & 8 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
@@ -30,7 +30,7 @@ import org.rogach.scallop.exceptions.ScallopException

 /* Usage:
  *
- * PATH_TO_SPARK
+ * PATH_TO_SPARK
  * --class io.archivesunleashed.app.CommandLinAppRunner
  * PATH_TO_AUT_JAR
  * --extractor EXTRACTOR
@@ -40,7 +40,7 @@ import org.rogach.scallop.exceptions.ScallopException
  * [--df]
  * [--split]
  * [--partiton]
- *
+ *
  * where EXTRACTOR is one of
  * DomainFrequencyExtractor, DomainGraphExtractor or PlainTextExtractor
  *
@@ -72,9 +72,11 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) {
   */
  override def onError(e: Throwable): Unit = e match {
    case ScallopException(message) =>
+      // scalastyle:off
      println(message)
+      // scalastyle:on
      throw new IllegalArgumentException()
-    case other => throw other
+    case other: Any => throw other
  }

  mainOptions = Seq(input, output)
@@ -205,7 +207,7 @@ class CommandLineApp(conf: CmdAppConf) {
   * @throws IllegalArgumentException exception thrown
   */

-  def verifyArgumentsOrExit() = {
+  def verifyArgumentsOrExit(): Unit = {
    configuration.input() foreach { f =>
      if (!Files.exists(Paths.get(f))) {
        logger.error(f + " not found")
@@ -223,7 +225,7 @@ class CommandLineApp(conf: CmdAppConf) {
   * @return Any
   */

-  def dfHandler() = {
+  def dfHandler(): Any = {
    if (!(dfExtractors contains configuration.extractor())) {
      logger.error(configuration.extractor() + " not supported with data frame. " +
        "The following extractors are supported: ")
@@ -254,9 +256,10 @@ class CommandLineApp(conf: CmdAppConf) {
   * @return Any
   */

-  def rddHandler() = {
+  def rddHandler(): Any = {
    if (!(rddExtractors contains configuration.extractor())) {
-      logger.error(configuration.extractor() + " not supported with RDD. The following extractors are supported: ")
+      logger.error(configuration.extractor() +
+        " not supported with RDD. The following extractors are supported: ")
      rddExtractors foreach { tuple => logger.error(tuple._1) }
      throw new IllegalArgumentException()
    }
@@ -290,7 +293,7 @@ class CommandLineApp(conf: CmdAppConf) {
   *
   * @return Any
   */
-  def process() = {
+  def process(): Any = {
    if (!configuration.df.isEmpty && configuration.df()) {
      dfHandler()
    } else {
8 changes: 5 additions & 3 deletions src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
@@ -16,8 +16,10 @@
  */
 package io.archivesunleashed.app

+// scalastyle:off underscore.import
 import io.archivesunleashed._
-import io.archivesunleashed.matchbox.{NER3Classifier, RemoveHTML}
+// scalastyle:on underscore.import
+import io.archivesunleashed.matchbox.{NERClassifier, RemoveHTML}
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
@@ -72,8 +74,8 @@ object ExtractEntities {
   */
  def extractAndOutput(iNerClassifierFile: String, rdd: RDD[(String, String, String)], outputFile: String): RDD[(String, String, String)] = {
    val r = rdd.mapPartitions(iter => {
-      NER3Classifier.apply(iNerClassifierFile)
-      iter.map(r => (r._1, r._2, NER3Classifier.classify(r._3)))
+      NERClassifier.apply(iNerClassifierFile)
+      iter.map(r => (r._1, r._2, NERClassifier.classify(r._3)))
    })
    r.saveAsTextFile(outputFile)
    r
2 changes: 2 additions & 0 deletions src/main/scala/io/archivesunleashed/app/ExtractGraph.scala
@@ -19,7 +19,9 @@ package io.archivesunleashed.app
 import io.archivesunleashed.ArchiveRecord
 import io.archivesunleashed.matchbox.{ExtractLinks, ExtractDomain, WWWLink}
 import io.archivesunleashed.util.JsonUtils
+// scalastyle:off underscore.import
 import org.apache.spark.graphx._
+// scalastyle:on underscore.import
 import org.apache.spark.rdd.RDD

/** Extracts a network graph using Spark's GraphX utility. */
4 changes: 3 additions & 1 deletion src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala
@@ -16,9 +16,11 @@
  */
 package io.archivesunleashed.app

+// scalastyle:off underscore.import
 import io.archivesunleashed._
-import io.archivesunleashed.matchbox._
+import io.archivesunleashed.matchbox
 import org.apache.spark.graphx._
+// scalastyle:on underscore.import
 import org.apache.spark.rdd.RDD

/** Extracts a site link structure using Spark's GraphX utility. */
src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala

@@ -16,7 +16,9 @@
  */
 package io.archivesunleashed.app

+// scalastyle:off underscore.import
 import io.archivesunleashed._
+// scalastyle:on underscore.import
 import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.{RangePartitioner, SparkContext}
@@ -45,7 +47,6 @@ object ExtractPopularImages {
       .reduceByKey((image1, image2) => (image1._1, image1._2, image1._3 + image2._3))
       .map(x=> (x._2._3, x._2._2))
       .takeOrdered(limit)(Ordering[Int].on(x => -x._1))
-    res.foreach(x => println(x._1 + "\t" + x._2))
Review discussion on the removed println:

Member: Why is this being removed? Is it just printing to a stdout?

@greebie (Contributor, Author), Jul 31, 2018: Yeah - it was just printing. Scalastyle rejects all println. I can restore it and use a // scalastyle:off if it's necessary, however.

@greebie (Contributor, Author): I wonder if @ianmilligan1 can speak to usefulness of a popular images printout here if we already have the RDD of images available (i.e. they can be mapped through and printed out in scripts if needed)?

Member: I just ran it on both master and your branch, @greebie, and yes - probably that println was in there for debugging. The saveAsTextFile output is identical to the printed lines, so we were getting them twice. I think this is a good alteration.
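To illustrate the script-side printing suggested above: a minimal sketch, not part of this PR, assuming a live SparkContext (sc, as in spark-shell), a hypothetical WARC path, and an ExtractPopularImages(records, limit, sc) call signature; the element type of the returned RDD is assumed printable, following the count-and-URL pairs the removed println produced.

    import io.archivesunleashed._
    import io.archivesunleashed.app.ExtractPopularImages

    // Ad hoc driver script: recompute the ten most popular images and
    // print them locally, now that the library itself no longer does.
    val records = RecordLoader.loadArchives("/path/to/warcs", sc)
    val popular = ExtractPopularImages(records, 10, sc)

    // Collect to the driver first: calling foreach(println) on the RDD
    // itself would print on the executors, not in the local console.
    // scalastyle:off
    popular.collect.foreach(println)
    // scalastyle:on

Keeping println out of library code also keeps any scalastyle suppression confined to throwaway scripts like this one.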

     val numPartitions = if (limit <= LIMIT_MAXIMUM) 1 else Math.ceil(limit / LIMIT_DENOMINATOR).toInt
     val rdd = sc.parallelize(res)
     rdd.repartitionAndSortWithinPartitions(
18 changes: 11 additions & 7 deletions src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala
@@ -18,10 +18,12 @@ package io.archivesunleashed.app

import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter}

import io.archivesunleashed.matchbox.NER3Classifier
import io.archivesunleashed.matchbox.NERClassifier
import io.archivesunleashed.util.JsonUtils
import org.apache.hadoop.conf.Configuration
// scalastyle:off underscore.import
import org.apache.hadoop.fs._
// scalastyle:on underscore.import
import org.apache.spark.SparkContext

import scala.collection.mutable.MutableList
@@ -42,19 +44,20 @@ class NERCombinedJson extends Serializable {
     }.toList
   }

-  /** Combines directory of part-files containing one JSON array per line into a single file containing a single JSON array of arrays.
+  /** Combines directory of part-files containing one JSON array per line into a
+    * single file containing a single JSON array of arrays.
     *
     * @param srcDir name of directory holding files, also name that will
     *               be given to JSON file
     * @return Unit().
     */
   def partDirToFile(srcDir: String): Unit = {
+    val randomSample = 8
     val hadoopConfig = new Configuration()
     val hdfs = FileSystem.get(hadoopConfig)
     val rnd = new Random

     val srcPath = new Path(srcDir)
-    val tmpFile = rnd.alphanumeric.take(8).mkString + ".almostjson"
+    val tmpFile = rnd.alphanumeric.take(randomSample).mkString + ".almostjson"
     val tmpPath = new Path(tmpFile)

     // Merge part-files into single file.
@@ -86,16 +89,17 @@ class NERCombinedJson extends Serializable {
    * @param outputFile path of output file (e.g., "entities.json")
    * @param sc Spark context object
    */
-  def classify(iNerClassifierFile: String, inputFile: String, outputFile: String, sc: SparkContext) {
+  def classify(iNerClassifierFile: String, inputFile: String, outputFile: String,
+    sc: SparkContext): Unit = {
     val out = sc.textFile(inputFile)
       .mapPartitions(iter => {
-        NER3Classifier.apply(iNerClassifierFile)
+        NERClassifier.apply(iNerClassifierFile)
         iter.map(line => {
           val substrs = line.split(",", 3)
           (substrs(0), substrs(1), substrs(2))
         })
         .map(r => {
-          val classifiedJson = NER3Classifier.classify(r._3)
+          val classifiedJson = NERClassifier.classify(r._3)
           val classifiedMap = JsonUtils.fromJson(classifiedJson)
           val classifiedMapCountTuples: Map[String, List[(String, Int)]] = classifiedMap.map {
             case (nerType, entities: List[String @unchecked]) => (nerType, entities.groupBy(identity).mapValues(_.size).toList)
src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala

@@ -29,7 +29,7 @@ object PlainTextExtractor {
    * @return RDD[(String, String, String, String)], which holds
    *         (CrawlDate, Domain, Url, Text)
    */
-  def apply(records: RDD[ArchiveRecord]) = {
+  def apply(records: RDD[ArchiveRecord]): RDD[(String, String, String, String)] = {
     records
       .keepValidPages()
       .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString)))
@@ -42,8 +42,9 @@ object PlainTextExtractor {
    */
   def apply(d: DataFrame): Dataset[Row] = {
     val spark = SparkSession.builder().master("local").getOrCreate()
+    // scalastyle:off
     import spark.implicits._
-
+    // scalastyle:on
     d.select($"CrawlDate", df.ExtractBaseDomain($"Url").as("Domain"),
       $"Url", df.RemoveHTML($"Content").as("Text"))
   }