Address main scalastyle errors - #196 #248

Merged (6 commits, Aug 1, 2018)
37 changes: 22 additions & 15 deletions src/main/scala/io/archivesunleashed/ArchiveRecord.scala
@@ -62,66 +62,73 @@ class ArchiveRecordImpl(r: SerializableWritable[ArchiveRecordWritable]) extends
   var arcRecord: ARCRecord = null
   var warcRecord: WARCRecord = null

-  if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+  if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
     arcRecord = r.t.getRecord.asInstanceOf[ARCRecord]
-  else if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.WARC)
-    warcRecord = r.t.getRecord.asInstanceOf[WARCRecord]
-
+  } else if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.WARC) {
+    warcRecord = r.t.getRecord.asInstanceOf[WARCRecord]
+  }
   val ISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX")

   val getCrawlDate: String = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC){
       ExtractDate(arcRecord.getMetaData.getDate, ExtractDate.DateComponent.YYYYMMDD)
-    else
+    } else {
       ExtractDate(
         ArchiveUtils.get14DigitDate(
           ISO8601.parse(warcRecord.getHeader.getDate)), ExtractDate.DateComponent.YYYYMMDD)
+    }
   }

   val getCrawlMonth: String = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
       ExtractDate(arcRecord.getMetaData.getDate, ExtractDate.DateComponent.YYYYMM)
-    else
+    } else {
       ExtractDate(
         ArchiveUtils.get14DigitDate(
           ISO8601.parse(warcRecord.getHeader.getDate)), ExtractDate.DateComponent.YYYYMM)
+    }
   }

   val getContentBytes: Array[Byte] = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
       ArcRecordUtils.getBodyContent(arcRecord)
-    else
+    } else {
       WarcRecordUtils.getContent(warcRecord)
+    }
   }

   val getContentString: String = {
     new String(getContentBytes)
   }

   val getMimeType: String = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
       arcRecord.getMetaData.getMimetype
-    else
+    } else {
       WarcRecordUtils.getWarcResponseMimeType(getContentBytes)
+    }
   }

   val getUrl: String = {
-    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC)
+    if (r.t.getFormat == ArchiveRecordWritable.ArchiveFormat.ARC) {
       arcRecord.getMetaData.getUrl
-    else
+    } else {
       warcRecord.getHeader.getUrl
+    }
   }

   val getDomain: String = {
     ExtractDomain(getUrl)
   }

   val getImageBytes: Array[Byte] = {
-    if (getContentString.startsWith("HTTP/"))
+    if (getContentString.startsWith("HTTP/")) {
       getContentBytes.slice(
         getContentString.indexOf(RemoveHttpHeader.headerEnd)
           + RemoveHttpHeader.headerEnd.length, getContentBytes.length)
-    else
+    } else {
       getContentBytes
+    }
   }
 }
25 changes: 22 additions & 3 deletions src/main/scala/io/archivesunleashed/DataFrameLoader.scala
@@ -1,7 +1,26 @@
-package io.archivesunleashed
+/*
+ * Archives Unleashed Toolkit (AUT):
+ * An open-source platform for analyzing web archives.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.archivesunleashed

 import org.apache.spark.SparkContext
+// scalastyle:off underscore.import
 import org.apache.spark.sql._
+// scalastyle:on underscore.import

 class DataFrameLoader(sc: SparkContext) {
   def extractValidPages(path: String): DataFrame = {
@@ -16,8 +35,8 @@ class DataFrameLoader(sc: SparkContext) {

   /* Create a dataframe with (source page, image url) pairs */
   def extractImageLinks(path: String): DataFrame = {
-    RecordLoader.loadArchives(path, sc)
-      .extractImageLinksDF()
+    RecordLoader.loadArchives(path, sc)
+      .extractImageLinksDF()
   }

   /** Create a dataframe with (image url, type, width, height, md5, raw bytes) pairs */
19 changes: 11 additions & 8 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
@@ -30,7 +30,7 @@ import org.rogach.scallop.exceptions.ScallopException

 /* Usage:
  *
- * PATH_TO_SPARK
+ * PATH_TO_SPARK
  * --class io.archivesunleashed.app.CommandLinAppRunner
  * PATH_TO_AUT_JAR
  * --extractor EXTRACTOR
@@ -40,7 +40,7 @@ import org.rogach.scallop.exceptions.ScallopException
  * [--df]
  * [--split]
  * [--partiton]
- *
+ *
  * where EXTRACTOR is one of
  * DomainFrequencyExtractor, DomainGraphExtractor or PlainTextExtractor
  *
@@ -72,9 +72,11 @@ class CmdAppConf(args: Seq[String]) extends ScallopConf(args) {
   */
  override def onError(e: Throwable): Unit = e match {
    case ScallopException(message) =>
+      // scalastyle:off
      println(message)
+      // scalastyle:on
      throw new IllegalArgumentException()
-    case other => throw other
+    case other: Any => throw other
  }

  mainOptions = Seq(input, output)
@@ -205,7 +207,7 @@ class CommandLineApp(conf: CmdAppConf) {
   * @throws IllegalArgumentException exception thrown
   */

-  def verifyArgumentsOrExit() = {
+  def verifyArgumentsOrExit(): Unit = {
    configuration.input() foreach { f =>
      if (!Files.exists(Paths.get(f))) {
        logger.error(f + " not found")
@@ -223,7 +225,7 @@ class CommandLineApp(conf: CmdAppConf) {
   * @return Any
   */

-  def dfHandler() = {
+  def dfHandler(): Any = {
    if (!(dfExtractors contains configuration.extractor())) {
      logger.error(configuration.extractor() + " not supported with data frame. " +
        "The following extractors are supported: ")
@@ -254,9 +256,10 @@ class CommandLineApp(conf: CmdAppConf) {
   * @return Any
   */

-  def rddHandler() = {
+  def rddHandler(): Any = {
    if (!(rddExtractors contains configuration.extractor())) {
-      logger.error(configuration.extractor() + " not supported with RDD. The following extractors are supported: ")
+      logger.error(configuration.extractor() +
+        " not supported with RDD. The following extractors are supported: ")
      rddExtractors foreach { tuple => logger.error(tuple._1) }
      throw new IllegalArgumentException()
    }
@@ -290,7 +293,7 @@ class CommandLineApp(conf: CmdAppConf) {
   *
   * @return Any
   */
-  def process() = {
+  def process(): Any = {
    if (!configuration.df.isEmpty && configuration.df()) {
      dfHandler()
    } else {
8 changes: 5 additions & 3 deletions src/main/scala/io/archivesunleashed/app/ExtractEntities.scala
@@ -16,8 +16,10 @@
  */
 package io.archivesunleashed.app

+// scalastyle:off underscore.import
 import io.archivesunleashed._
-import io.archivesunleashed.matchbox.{NER3Classifier, RemoveHTML}
+// scalastyle:on underscore.import
+import io.archivesunleashed.matchbox.{NERClassifier, RemoveHTML}
 import org.apache.spark.SparkContext
 import org.apache.spark.rdd.RDD
@@ -72,8 +74,8 @@ object ExtractEntities {
   */
  def extractAndOutput(iNerClassifierFile: String, rdd: RDD[(String, String, String)], outputFile: String): RDD[(String, String, String)] = {
    val r = rdd.mapPartitions(iter => {
-      NER3Classifier.apply(iNerClassifierFile)
-      iter.map(r => (r._1, r._2, NER3Classifier.classify(r._3)))
+      NERClassifier.apply(iNerClassifierFile)
+      iter.map(r => (r._1, r._2, NERClassifier.classify(r._3)))
    })
    r.saveAsTextFile(outputFile)
    r
2 changes: 2 additions & 0 deletions src/main/scala/io/archivesunleashed/app/ExtractGraph.scala
@@ -19,7 +19,9 @@ package io.archivesunleashed.app
 import io.archivesunleashed.ArchiveRecord
 import io.archivesunleashed.matchbox.{ExtractLinks, ExtractDomain, WWWLink}
 import io.archivesunleashed.util.JsonUtils
+// scalastyle:off underscore.import
 import org.apache.spark.graphx._
+// scalastyle:on underscore.import
 import org.apache.spark.rdd.RDD

/** Extracts a network graph using Spark's GraphX utility. */
4 changes: 3 additions & 1 deletion src/main/scala/io/archivesunleashed/app/ExtractGraphX.scala
@@ -16,9 +16,11 @@
  */
 package io.archivesunleashed.app

+// scalastyle:off underscore.import
 import io.archivesunleashed._
-import io.archivesunleashed.matchbox._
+import io.archivesunleashed.matchbox
 import org.apache.spark.graphx._
+// scalastyle:on underscore.import
 import org.apache.spark.rdd.RDD

/** Extracts a site link structure using Spark's GraphX utility. */
src/main/scala/io/archivesunleashed/app/ExtractPopularImages.scala

@@ -16,7 +16,9 @@
  */
 package io.archivesunleashed.app

+// scalastyle:off underscore.import
 import io.archivesunleashed._
+// scalastyle:on underscore.import
 import io.archivesunleashed.matchbox.{ComputeImageSize, ComputeMD5}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.{RangePartitioner, SparkContext}
@@ -45,7 +47,6 @@ object ExtractPopularImages {
       .reduceByKey((image1, image2) => (image1._1, image1._2, image1._3 + image2._3))
       .map(x=> (x._2._3, x._2._2))
       .takeOrdered(limit)(Ordering[Int].on(x => -x._1))
-    res.foreach(x => println(x._1 + "\t" + x._2))
Review discussion on the removed println:

Member: Why is this being removed? Is it just printing to a stdout?

@greebie (Contributor, Author), Jul 31, 2018: Yeah - it was just printing. Scalastyle rejects all println. I can restore it and use a // scalastyle:off if it's necessary, however.

@greebie (Contributor, Author): I wonder if @ianmilligan1 can speak to usefulness of a popular images printout here if we already have the RDD of images available (i.e. they can be mapped through and printed out in scripts if needed)?

Member: I just ran it on both master and your branch, @greebie, and yes - probably that println was in there for debugging. The saveAsTextFile output is identical to the printed lines, so we were getting them twice. I think this is a good alteration.
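To illustrate the script-side printing suggested above: a minimal sketch, not part of this PR, assuming a live SparkContext (sc, as in spark-shell), a hypothetical WARC path, and an ExtractPopularImages(records, limit, sc) call signature; the element type of the returned RDD is assumed printable, following the count-and-URL pairs the removed println produced.

    import io.archivesunleashed._
    import io.archivesunleashed.app.ExtractPopularImages

    // Ad hoc driver script: recompute the ten most popular images and
    // print them locally, now that the library itself no longer does.
    val records = RecordLoader.loadArchives("/path/to/warcs", sc)
    val popular = ExtractPopularImages(records, 10, sc)

    // Collect to the driver first: calling foreach(println) on the RDD
    // itself would print on the executors, not in the local console.
    // scalastyle:off
    popular.collect.foreach(println)
    // scalastyle:on

Keeping println out of library code also keeps any scalastyle suppression confined to throwaway scripts like this one.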

     val numPartitions = if (limit <= LIMIT_MAXIMUM) 1 else Math.ceil(limit / LIMIT_DENOMINATOR).toInt
     val rdd = sc.parallelize(res)
     rdd.repartitionAndSortWithinPartitions(
18 changes: 11 additions & 7 deletions src/main/scala/io/archivesunleashed/app/NERCombinedJson.scala
@@ -18,10 +18,12 @@ package io.archivesunleashed.app

import java.io.{BufferedReader, BufferedWriter, InputStreamReader, OutputStreamWriter}

import io.archivesunleashed.matchbox.NER3Classifier
import io.archivesunleashed.matchbox.NERClassifier
import io.archivesunleashed.util.JsonUtils
import org.apache.hadoop.conf.Configuration
// scalastyle:off underscore.import
import org.apache.hadoop.fs._
// scalastyle:on underscore.import
import org.apache.spark.SparkContext

import scala.collection.mutable.MutableList
@@ -42,19 +44,20 @@ class NERCombinedJson extends Serializable {
     }.toList
   }

-  /** Combines directory of part-files containing one JSON array per line into a single file containing a single JSON array of arrays.
+  /** Combines directory of part-files containing one JSON array per line into a
+    * single file containing a single JSON array of arrays.
     *
     * @param srcDir name of directory holding files, also name that will
     *               be given to JSON file
     * @return Unit().
     */
   def partDirToFile(srcDir: String): Unit = {
+    val randomSample = 8
     val hadoopConfig = new Configuration()
     val hdfs = FileSystem.get(hadoopConfig)
     val rnd = new Random

     val srcPath = new Path(srcDir)
-    val tmpFile = rnd.alphanumeric.take(8).mkString + ".almostjson"
+    val tmpFile = rnd.alphanumeric.take(randomSample).mkString + ".almostjson"
     val tmpPath = new Path(tmpFile)

     // Merge part-files into single file.
@@ -86,16 +89,17 @@ class NERCombinedJson extends Serializable {
    * @param outputFile path of output file (e.g., "entities.json")
    * @param sc Spark context object
    */
-  def classify(iNerClassifierFile: String, inputFile: String, outputFile: String, sc: SparkContext) {
+  def classify(iNerClassifierFile: String, inputFile: String, outputFile: String,
+    sc: SparkContext): Unit = {
     val out = sc.textFile(inputFile)
       .mapPartitions(iter => {
-        NER3Classifier.apply(iNerClassifierFile)
+        NERClassifier.apply(iNerClassifierFile)
         iter.map(line => {
           val substrs = line.split(",", 3)
           (substrs(0), substrs(1), substrs(2))
         })
         .map(r => {
-          val classifiedJson = NER3Classifier.classify(r._3)
+          val classifiedJson = NERClassifier.classify(r._3)
           val classifiedMap = JsonUtils.fromJson(classifiedJson)
           val classifiedMapCountTuples: Map[String, List[(String, Int)]] = classifiedMap.map {
             case (nerType, entities: List[String @unchecked]) => (nerType, entities.groupBy(identity).mapValues(_.size).toList)
src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala

@@ -29,7 +29,7 @@ object PlainTextExtractor {
    * @return RDD[(String, String, String, String)], which holds
    *         (CrawlDate, Domain, Url, Text)
    */
-  def apply(records: RDD[ArchiveRecord]) = {
+  def apply(records: RDD[ArchiveRecord]): RDD[(String, String, String, String)] = {
     records
       .keepValidPages()
       .map(r => (r.getCrawlDate, r.getDomain, r.getUrl, RemoveHTML(r.getContentString)))
@@ -42,8 +42,9 @@ object PlainTextExtractor {
    */
   def apply(d: DataFrame): Dataset[Row] = {
     val spark = SparkSession.builder().master("local").getOrCreate()
+    // scalastyle:off
     import spark.implicits._
-
+    // scalastyle:on
     d.select($"CrawlDate", df.ExtractBaseDomain($"Url").as("Domain"),
       $"Url", df.RemoveHTML($"Content").as("Text"))
   }