Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scalafix and remove unused imports. #548

Merged
merged 1 commit into from
Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,15 @@
<arg>-feature</arg>
<arg>-explaintypes</arg>
<arg>-target:jvm-1.8</arg>
<arg>-Ywarn-unused-import</arg>
</args>
<compilerPlugins>
<compilerPlugin>
<groupId>org.scalameta</groupId>
<artifactId>semanticdb-scalac_${scala.version}</artifactId>
<version>4.6.0</version>
</compilerPlugin>
</compilerPlugins>
</configuration>
</plugin>
<!-- For license header enforcement. -->
Expand Down Expand Up @@ -328,6 +336,11 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>io.github.evis</groupId>
<artifactId>scalafix-maven-plugin_${scala.binary.version}</artifactId>
<version>0.1.7_0.10.4</version>
</plugin>
</plugins>
</build>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,7 @@ import io.archivesunleashed.matchbox.ExtractDomain
import org.apache.tika.io.BoundedInputStream
import org.archive.webservices.sparkling.http.HttpMessage
import org.archive.webservices.sparkling.io.IOUtil
import org.archive.webservices.sparkling.util.{
ManagedVal,
RegexUtil,
ValueSupplier
}
import org.archive.webservices.sparkling.util.{ManagedVal, ValueSupplier}
import org.archive.webservices.sparkling.warc.{WarcHeaders, WarcRecord}
import scala.util.Try

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object AudioInformationExtractor {
Expand Down
4 changes: 2 additions & 2 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ package io.archivesunleashed.app
import java.io.File
import java.nio.file.{Files, Paths}

import io.archivesunleashed.{ArchiveRecord, RecordLoader}
import io.archivesunleashed.RecordLoader
import org.apache.log4j.Logger
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.{SparkConf, SparkContext}
import org.rogach.scallop.exceptions.ScallopException
import org.rogach.scallop.ScallopConf
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object CssInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,7 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import io.archivesunleashed.udfs.{extractDomain, removePrefixWWW}
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object DomainFrequencyExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import io.archivesunleashed.udfs.{extractDomain, removePrefixWWW}
import org.apache.spark.sql.functions.{desc, substring}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,8 @@
*/
package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.functions.{col, desc, first}
import org.apache.spark.sql.functions.{desc, first}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{RangePartitioner, SparkContext}

/** Extract the most popular images from a DataFrame. */
object ExtractPopularImagesDF {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object HtmlInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object ImageGraphExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object ImageInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object JsInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object JsonInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PDFInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.udfs.{extractBoilerpipeText}
import io.archivesunleashed.udfs.extractBoilerpipeText
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions.lower
import scala.language.postfixOps
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PlainTextInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PresentationProgramInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object SpreadsheetInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object VideoInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object WebGraphExtractor {
Expand Down
10 changes: 0 additions & 10 deletions src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.udfs.{
extractDomain,
removeHTML,
removeHTTPHeader,
removePrefixWWW
}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object WebPagesExtractor {
Expand All @@ -35,9 +28,6 @@ object WebPagesExtractor {
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
// scalastyle:off
import spark.implicits._
// scalastyle:on
d
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object WordProcessorInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object XmlInformationExtractor {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
package io.archivesunleashed.matchbox

import de.l3s.boilerpipe.extractors.DefaultExtractor
import java.io.IOException

/** Extract raw text content from an HTML page, minus "boilerplate" content (using boilerpipe). */
object ExtractBoilerpipeText {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/
package io.archivesunleashed.matchbox

import java.io.IOException
import org.jsoup.Jsoup
import org.jsoup.select.Elements
import scala.collection.mutable
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,7 @@
*/
package io.archivesunleashed.matchbox

import java.io.ByteArrayInputStream
import org.apache.tika.metadata.Metadata
import org.apache.tika.parser.ParseContext
import org.apache.tika.parser.pdf.PDFParser
import org.apache.tika.sax.BodyContentHandler;

/** Extracts text from PDFs using Apache Tika. */
object ExtractTextFromPDFs {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/
package io.archivesunleashed.matchbox

import java.io.IOException
import org.jsoup.Jsoup

/** Removes HTML markup with JSoup. */
Expand Down
1 change: 0 additions & 1 deletion src/main/scala/io/archivesunleashed/matchbox/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
package io.archivesunleashed

import java.io.IOException
import java.security.MessageDigest
import scala.xml.Utility.escape

/** Package object which supplies implicits providing common UDF-related functionalities. */
Expand Down
9 changes: 3 additions & 6 deletions src/main/scala/io/archivesunleashed/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -35,36 +35,33 @@ import io.archivesunleashed.matchbox.{
}
import io.archivesunleashed.matchbox.ExtractDate.DateComponent
import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent
import java.net.URI
import java.net.URL

import org.apache.commons.codec.binary.Hex
import org.apache.commons.io.FilenameUtils
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.{lit, lower, udf}
import org.apache.spark.sql.types.{
BinaryType,
IntegerType,
StringType,
StructField,
StructType
}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.{RangePartitioner, SerializableWritable, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.SparkContext
import org.archive.webservices.sparkling.io.{HdfsIO, IOUtil}
import org.archive.webservices.sparkling.util.{
IteratorUtil,
ManagedVal,
RddUtil,
ValueSupplier
}
import org.archive.webservices.sparkling.warc.{WarcLoader, WarcRecord}
import org.archive.webservices.sparkling.warc.WarcLoader

import scala.language.postfixOps
import scala.reflect.ClassTag
import scala.util.matching.Regex
import scala.util.Try

/**
* Package object which supplies implicits to augment generic RDDs with AUT-specific transformations.
Expand Down
3 changes: 0 additions & 3 deletions src/main/scala/io/archivesunleashed/udfs/package.scala
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,8 @@ import io.archivesunleashed.matchbox.{
RemoveHTML,
RemoveHTTPHeader
}
import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.SparkSession
import scala.util.matching.Regex

/** Package object providing UDFs for DataFrames in Scala and PySpark. */
package object udfs extends Serializable {
Expand Down
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/RecordDFTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ import io.archivesunleashed.udfs.{
}
import com.google.common.io.Resources
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
package io.archivesunleashed.df

import com.google.common.io.Resources
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,7 @@
package io.archivesunleashed

import com.google.common.io.Resources
import io.archivesunleashed.udfs.{
extractDate,
extractDomain,
extractLinks,
removePrefixWWW
}
import io.archivesunleashed.udfs.{extractDate, extractLinks}
import org.apache.spark.sql.functions.{array, explode_outer, lower, udf}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
package io.archivesunleashed

import com.google.common.io.Resources
import io.archivesunleashed.udfs.{extractDomain, extractLinks, removePrefixWWW}
import io.archivesunleashed.udfs.extractLinks
import org.apache.spark.sql.functions.{array, explode_outer, lower, udf}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package io.archivesunleashed

import com.google.common.io.Resources
import org.apache.spark.sql.functions.desc
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
Expand Down
Loading