Skip to content

Commit

Permalink
Add scalafix and remove unused imports. (#548)
Browse files Browse the repository at this point in the history
  • Loading branch information
ruebot committed Nov 8, 2022
1 parent cdf8e76 commit 24bb5e5
Show file tree
Hide file tree
Showing 46 changed files with 25 additions and 92 deletions.
13 changes: 13 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,15 @@
<arg>-feature</arg>
<arg>-explaintypes</arg>
<arg>-target:jvm-1.8</arg>
<arg>-Ywarn-unused-import</arg>
</args>
<compilerPlugins>
<compilerPlugin>
<groupId>org.scalameta</groupId>
<artifactId>semanticdb-scalac_${scala.version}</artifactId>
<version>4.6.0</version>
</compilerPlugin>
</compilerPlugins>
</configuration>
</plugin>
<!-- For license header enforcement. -->
Expand Down Expand Up @@ -328,6 +336,11 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>io.github.evis</groupId>
<artifactId>scalafix-maven-plugin_${scala.binary.version}</artifactId>
<version>0.1.7_0.10.4</version>
</plugin>
</plugins>
</build>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,7 @@ import io.archivesunleashed.matchbox.ExtractDomain
import org.apache.tika.io.BoundedInputStream
import org.archive.webservices.sparkling.http.HttpMessage
import org.archive.webservices.sparkling.io.IOUtil
import org.archive.webservices.sparkling.util.{
ManagedVal,
RegexUtil,
ValueSupplier
}
import org.archive.webservices.sparkling.util.{ManagedVal, ValueSupplier}
import org.archive.webservices.sparkling.warc.{WarcHeaders, WarcRecord}
import scala.util.Try

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object AudioInformationExtractor {
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ package io.archivesunleashed.app
import java.io.File
import java.nio.file.{Files, Paths}

import io.archivesunleashed.{ArchiveRecord, RecordLoader}
import io.archivesunleashed.RecordLoader
import org.apache.log4j.Logger
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.{SparkConf, SparkContext}
import org.rogach.scallop.exceptions.ScallopException
import org.rogach.scallop.ScallopConf
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object CssInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,7 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import io.archivesunleashed.udfs.{extractDomain, removePrefixWWW}
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object DomainFrequencyExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import io.archivesunleashed.udfs.{extractDomain, removePrefixWWW}
import org.apache.spark.sql.functions.{desc, substring}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
*/
package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.functions.{col, desc, first}
import org.apache.spark.sql.functions.{desc, first}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{RangePartitioner, SparkContext}

/** Extract the most popular images from a DataFrame. */
object ExtractPopularImagesDF {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object HtmlInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object ImageGraphExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object ImageInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object JsInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object JsonInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PDFInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.udfs.{extractBoilerpipeText}
import io.archivesunleashed.udfs.extractBoilerpipeText
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions.lower
import scala.language.postfixOps
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PlainTextInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PresentationProgramInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object SpreadsheetInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object VideoInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object WebGraphExtractor {
Expand Down
10 changes: 0 additions & 10 deletions src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.udfs.{
extractDomain,
removeHTML,
removeHTTPHeader,
removePrefixWWW
}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object WebPagesExtractor {
Expand All @@ -35,9 +28,6 @@ object WebPagesExtractor {
*/
def apply(d: DataFrame): Dataset[Row] = {
// Obtain (or reuse, via getOrCreate) a SparkSession built with a "local" master.
// NOTE(review): within this method `spark` is referenced only by the implicits
// import below — confirm whether the session is needed for anything else.
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
// The input DataFrame is handed back as-is; no transformation is applied here.
d
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object WordProcessorInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object XmlInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
package io.archivesunleashed.matchbox

import de.l3s.boilerpipe.extractors.DefaultExtractor
import java.io.IOException

/** Extract raw text content from an HTML page, minus "boilerplate" content (using boilerpipe). */
object ExtractBoilerpipeText {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/
package io.archivesunleashed.matchbox

import java.io.IOException
import org.jsoup.Jsoup
import org.jsoup.select.Elements
import scala.collection.mutable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@
*/
package io.archivesunleashed.matchbox

import java.io.ByteArrayInputStream
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.pdf.PDFParser
import org.apache.tika.sax.BodyContentHandler;

/** Extracts text from PDFs using Apache Tika. */
object ExtractTextFromPDFs {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/
package io.archivesunleashed.matchbox

import java.io.IOException
import org.jsoup.Jsoup

/** Removes HTML markup with JSoup. */
Expand Down
1 change: 0 additions & 1 deletion src/main/scala/io/archivesunleashed/matchbox/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
package io.archivesunleashed

import java.io.IOException
import java.security.MessageDigest
import scala.xml.Utility.escape

/** Package object which supplies implicits providing common UDF-related functionalities. */
Expand Down
9 changes: 3 additions & 6 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -35,36 +35,33 @@ import io.archivesunleashed.matchbox.{
}
import io.archivesunleashed.matchbox.ExtractDate.DateComponent
import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent
import java.net.URI
import java.net.URL

import org.apache.commons.codec.binary.Hex
import org.apache.commons.io.FilenameUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.{lit, lower, udf}
import org.apache.spark.sql.types.{
BinaryType,
IntegerType,
StringType,
StructField,
StructType
}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{RangePartitioner, SerializableWritable, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.SparkContext
import org.archive.webservices.sparkling.io.{HdfsIO, IOUtil}
import org.archive.webservices.sparkling.util.{
IteratorUtil,
ManagedVal,
RddUtil,
ValueSupplier
}
import org.archive.webservices.sparkling.warc.{WarcLoader, WarcRecord}
import org.archive.webservices.sparkling.warc.WarcLoader

import scala.language.postfixOps
import scala.reflect.ClassTag
import scala.util.matching.Regex
import scala.util.Try

/**
* Package object which supplies implicits to augment generic RDDs with AUT-specific transformations.
Expand Down
3 changes: 0 additions & 3 deletions src/main/scala/io/archivesunleashed/udfs/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,8 @@ import io.archivesunleashed.matchbox.{
RemoveHTML,
RemoveHTTPHeader
}
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.SparkSession
import scala.util.matching.Regex

/** Package object providing UDFs for DataFrames in Scala and PySpark. */
package object udfs extends Serializable {
Expand Down
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/RecordDFTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ import io.archivesunleashed.udfs.{
}
import com.google.common.io.Resources
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
package io.archivesunleashed.df

import com.google.common.io.Resources
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,7 @@
package io.archivesunleashed

import com.google.common.io.Resources
import io.archivesunleashed.udfs.{
extractDate,
extractDomain,
extractLinks,
removePrefixWWW
}
import io.archivesunleashed.udfs.{extractDate, extractLinks}
import org.apache.spark.sql.functions.{array, explode_outer, lower, udf}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
package io.archivesunleashed

import com.google.common.io.Resources
import io.archivesunleashed.udfs.{extractDomain, extractLinks, removePrefixWWW}
import io.archivesunleashed.udfs.extractLinks
import org.apache.spark.sql.functions.{array, explode_outer, lower, udf}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Loading

0 comments on commit 24bb5e5

Please sign in to comment.