From be3000a1fc30e1ad90610aece26793e753a464f7 Mon Sep 17 00:00:00 2001 From: nruest Date: Fri, 10 Jun 2022 12:19:25 -0400 Subject: [PATCH 01/20] [maven-release-plugin] prepare for next development iteration --- README.md | 4 ++-- pom.xml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 155bf733..f5dd53a4 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # The Archives Unleashed Toolkit [![codecov](https://codecov.io/gh/archivesunleashed/aut/branch/main/graph/badge.svg)](https://codecov.io/gh/archivesunleashed/aut) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut) -[![Scaladoc](https://img.shields.io/badge/Scaladoc-0.91.0-blue?style=flat)](https://api.docs.archivesunleashed.io/0.91.0/scaladocs/io/archivesunleashed/index.html) -[![UserDocs](https://img.shields.io/badge/UserDocs-0.91.0-blue?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) +[![Scaladoc](https://img.shields.io/badge/Scaladoc-1.0.0-blue?style=flat)](https://api.docs.archivesunleashed.io/1.0.0/scaladocs/io/archivesunleashed/index.html) +[![UserDocs](https://img.shields.io/badge/UserDocs-1.0.0-blue?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) [![LICENSE](https://img.shields.io/badge/license-Apache-blue.svg?style=flat)](https://www.apache.org/licenses/LICENSE-2.0) [![Contribution Guidelines](http://img.shields.io/badge/CONTRIBUTING-Guidelines-blue.svg)](./CONTRIBUTING.md) diff --git a/pom.xml b/pom.xml index b6f95fd7..468d875a 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ io.archivesunleashed aut jar - 1.0.0 + 1.0.1-SNAPSHOT Archives Unleashed Toolkit An open-source toolkit for analyzing web archives. https://github.com/archivesunleashed/aut @@ -59,7 +59,7 @@ scm:git:git@github.com:archivesunleashed/aut.git scm:git:git@github.com:archivesunleashed/aut.git git@github.com:archivesunleashed/aut.git - aut-1.0.0 + HEAD From 2b8b717a3215a1de8c0aa14a110ba3006ff99e25 Mon Sep 17 00:00:00 2001 From: nruest Date: Sat, 11 Jun 2022 13:11:09 -0400 Subject: [PATCH 02/20] Update CHANGELOG for 1.0.0 release. --- CHANGELOG.md | 48 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e23c3b56..863c2a22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,45 @@ # Changelog +## [aut-1.0.0](https://github.com/archivesunleashed/aut/tree/aut-1.0.0) (2022-06-10) + +[Full Changelog](https://github.com/archivesunleashed/aut/compare/aut-0.91.0...aut-1.0.0) + +**Implemented enhancements:** + +- Remove http headers, and html on webpages\(\) [\#538](https://github.com/archivesunleashed/aut/issues/538) +- Add domain column to webpages\(\) [\#534](https://github.com/archivesunleashed/aut/issues/534) +- Replace Java ARC/WARC record processing library [\#494](https://github.com/archivesunleashed/aut/issues/494) +- Method to perform finer-grained selection of ARCs and WARCs [\#247](https://github.com/archivesunleashed/aut/issues/247) +- Unnecessary buffer copying [\#18](https://github.com/archivesunleashed/aut/issues/18) + +**Fixed bugs:** + +- Discard date RDD filter only takes a single string, not a list of strings. 
[\#532](https://github.com/archivesunleashed/aut/issues/532) +- Extract gzip data from transfer-encoded WARC [\#493](https://github.com/archivesunleashed/aut/issues/493) +- ARC reader string vs int error on record length [\#492](https://github.com/archivesunleashed/aut/issues/492) + +**Closed issues:** + +- java.lang.RuntimeException: Unsupported literal type class scala.collection.immutable.Set$Set1 Set\(liberal.ca\) [\#529](https://github.com/archivesunleashed/aut/issues/529) +- Improve CommandLineApp.scala test coverage [\#262](https://github.com/archivesunleashed/aut/issues/262) +- Improve ExtractBoilerpipeText.scala test coverage [\#261](https://github.com/archivesunleashed/aut/issues/261) +- Improve ArchiveRecord.scala test coverage [\#260](https://github.com/archivesunleashed/aut/issues/260) +- Unit testing for RecordLoader [\#182](https://github.com/archivesunleashed/aut/issues/182) +- Improve ArchiveRecordWritable.java test coverage [\#76](https://github.com/archivesunleashed/aut/issues/76) +- Improve WarcRecordUtils.java test coverage [\#74](https://github.com/archivesunleashed/aut/issues/74) +- Improve ArcRecordUtils.java test coverage [\#73](https://github.com/archivesunleashed/aut/issues/73) +- Improve ExtractDate.scala test coverage [\#64](https://github.com/archivesunleashed/aut/issues/64) +- Remove org.apache.commons.httpclient [\#23](https://github.com/archivesunleashed/aut/issues/23) + +**Merged pull requests:** + +- Make webpages\(\) consistent across aut and ARCH. [\#539](https://github.com/archivesunleashed/aut/pull/539) ([ruebot](https://github.com/ruebot)) +- Update README [\#537](https://github.com/archivesunleashed/aut/pull/537) ([ruebot](https://github.com/ruebot)) +- Fix codecov GitHub action. [\#536](https://github.com/archivesunleashed/aut/pull/536) ([ruebot](https://github.com/ruebot)) +- Bump commons-compress from 1.14 to 1.21 [\#535](https://github.com/archivesunleashed/aut/pull/535) ([dependabot[bot]](https://github.com/apps/dependabot)) +- Remove Java w/arc processing, and replace it with Sparkling. [\#533](https://github.com/archivesunleashed/aut/pull/533) ([ruebot](https://github.com/ruebot)) +- Bump xercesImpl from 2.12.0 to 2.12.2 [\#527](https://github.com/archivesunleashed/aut/pull/527) ([dependabot[bot]](https://github.com/apps/dependabot)) + ## [aut-0.91.0](https://github.com/archivesunleashed/aut/tree/aut-0.91.0) (2022-01-21) [Full Changelog](https://github.com/archivesunleashed/aut/compare/aut-0.90.4...aut-0.91.0) @@ -462,13 +502,13 @@ - Add ExtractGraphX including algorithms for PageRank and Components. Issue 203 [\#245](https://github.com/archivesunleashed/aut/pull/245) ([greebie](https://github.com/greebie)) - Travis build fixes [\#244](https://github.com/archivesunleashed/aut/pull/244) ([ruebot](https://github.com/ruebot)) - Data frame implementation of extractors. Also added cmd arguments to resolve \#235 [\#236](https://github.com/archivesunleashed/aut/pull/236) ([TitusAn](https://github.com/TitusAn)) -- Save images from dataframe to disk [\#234](https://github.com/archivesunleashed/aut/pull/234) ([JWZ2018](https://github.com/JWZ2018)) +- Save images from dataframe to disk [\#234](https://github.com/archivesunleashed/aut/pull/234) ([jwli229](https://github.com/jwli229)) - Add missing dependencies in; addresses \#227. 
[\#233](https://github.com/archivesunleashed/aut/pull/233) ([ruebot](https://github.com/ruebot)) - Code cleanup: ArchiveRecord + impl moved into same Scala file [\#230](https://github.com/archivesunleashed/aut/pull/230) ([lintool](https://github.com/lintool)) -- Add Extract Image Details API [\#226](https://github.com/archivesunleashed/aut/pull/226) ([JWZ2018](https://github.com/JWZ2018)) +- Add Extract Image Details API [\#226](https://github.com/archivesunleashed/aut/pull/226) ([jwli229](https://github.com/jwli229)) - Implement DomainFrequency, DomainGraph and PlainText extractor that can be run from command line [\#225](https://github.com/archivesunleashed/aut/pull/225) ([TitusAn](https://github.com/TitusAn)) -- Remove duplicate call of keepValidPages [\#224](https://github.com/archivesunleashed/aut/pull/224) ([JWZ2018](https://github.com/JWZ2018)) -- Extract Image Links DF API + Test [\#221](https://github.com/archivesunleashed/aut/pull/221) ([JWZ2018](https://github.com/JWZ2018)) +- Remove duplicate call of keepValidPages [\#224](https://github.com/archivesunleashed/aut/pull/224) ([jwli229](https://github.com/jwli229)) +- Extract Image Links DF API + Test [\#221](https://github.com/archivesunleashed/aut/pull/221) ([jwli229](https://github.com/jwli229)) - Update Apache Spark to 2.3.0; resolves \#218 [\#219](https://github.com/archivesunleashed/aut/pull/219) ([ruebot](https://github.com/ruebot)) - Resolve https://github.com/archivesunleashed/docker-aut/issues/17 [\#217](https://github.com/archivesunleashed/aut/pull/217) ([ruebot](https://github.com/ruebot)) - Create issue templates [\#216](https://github.com/archivesunleashed/aut/pull/216) ([ruebot](https://github.com/ruebot)) From 817285537078099e07a72246da2fbfd24fd8568b Mon Sep 17 00:00:00 2001 From: Nick Ruest Date: Fri, 17 Jun 2022 10:43:28 -0400 Subject: [PATCH 03/20] Add ARCH text files derivatives. (#541) - Add css, html, js, json, plain text, and xml information extraction methods - Add app extractors - Add Python implementation of extractors - Add tests - Resolves #540 - Fix for MIME types with extra data. 
- Resolves #542 - Update Tika to 1.23 --- pom.xml | 2 +- src/main/python/aut/common.py | 18 + .../SparklingArchiveRecord.scala | 5 +- .../app/CommandLineApp.scala | 102 ++++++ .../app/CssInformationExtractor.scala | 35 ++ .../app/HtmlInformationExtractor.scala | 35 ++ .../app/JsInformationExtractor.scala | 35 ++ .../app/JsonInformationExtractor.scala | 35 ++ .../app/PlainTextInformationExtractor.scala | 35 ++ .../app/WebPagesExtractor.scala | 10 +- .../app/XmlInformationExtractor.scala | 35 ++ .../df/DataFrameLoader.scala | 42 +++ .../matchbox/DetectMimeTypeTika.scala | 17 +- .../scala/io/archivesunleashed/package.scala | 277 +++++++++++++++ .../app/CommandLineAppTest.scala | 324 ++++++++++++++++++ .../app/CssInformationExtractorTest.scala | 64 ++++ .../app/HtmlInformationExtractorTest.scala | 62 ++++ .../app/JsInformationExtractorTest.scala | 62 ++++ .../app/JsonInfromationExtractor.scala | 66 ++++ .../app/PlainTextInformationExtractor.scala | 62 ++++ .../app/XmlInfromationExtractor.scala | 65 ++++ .../df/DataFrameLoaderTest.scala | 46 +++ 22 files changed, 1418 insertions(+), 16 deletions(-) create mode 100644 src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala create mode 100644 src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala create mode 100644 src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala create mode 100644 src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala create mode 100644 src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala create mode 100644 src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala create mode 100644 src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala create mode 100644 src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala create mode 100644 src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala create mode 100644 src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala create mode 100644 src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala create mode 100644 src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala diff --git a/pom.xml b/pom.xml index 468d875a..00fc0977 100644 --- a/pom.xml +++ b/pom.xml @@ -42,7 +42,7 @@ 2.22.0 0.8.4 2.1 - 1.22 + 1.23 2.10.0 4.5.4 diff --git a/src/main/python/aut/common.py b/src/main/python/aut/common.py index 3529afde..f558e39e 100644 --- a/src/main/python/aut/common.py +++ b/src/main/python/aut/common.py @@ -14,15 +14,30 @@ def all(self): def audio(self): return DataFrame(self.loader.audio(self.path), self.sqlContext) + def css(self): + return DataFrame(self.loader.css(self.path), self.sqlContext) + + def html(self): + return DataFrame(self.loader.html(self.path), self.sqlContext) + def imagegraph(self): return DataFrame(self.loader.imagegraph(self.path), self.sqlContext) def images(self): return DataFrame(self.loader.images(self.path), self.sqlContext) + def js(self): + return DataFrame(self.loader.js(self.path), self.sqlContext) + + def json(self): + return DataFrame(self.loader.json(self.path), self.sqlContext) + def pdfs(self): return DataFrame(self.loader.pdfs(self.path), self.sqlContext) + def plain_text(self): + return DataFrame(self.loader.plainText(self.path), self.sqlContext) + def presentation_program(self): return DataFrame( self.loader.presentationProgramFiles(self.path), self.sqlContext @@ -42,3 +57,6 @@ def webpages(self): def word_processor(self): return 
DataFrame(self.loader.wordProcessorFiles(self.path), self.sqlContext) + + def xml(self): + return DataFrame(self.loader.xml(self.path), self.sqlContext) diff --git a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala index adbce764..c96b09bd 100644 --- a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala @@ -81,7 +81,10 @@ class SparklingArchiveRecord( }.getOrElse("") override def getMimeType: String = - http(warc).flatMap(_.mime).getOrElse("unknown") + http(warc) + .flatMap(_.mime) + .getOrElse("unknown") + .replaceAll(" .*|\\s|\\n", "") override def getUrl: String = warc.url.getOrElse("").replaceAll("<|>", "") diff --git a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala index a64b9676..c1fc05bb 100644 --- a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala +++ b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala @@ -313,6 +313,108 @@ class CommandLineApp(conf: CmdAppConf) { } else { saveCsv(WordProcessorInformationExtractor(df)) } + }), + "CssInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader + .loadArchives(inputFiles.head, sparkCtx.get) + .css() + inputFiles.tail foreach { f => + df = df.union( + RecordLoader.loadArchives(f, sparkCtx.get).css() + ) + } + if (!configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet") { + saveParquet(CssInformationExtractor(df)) + } else { + saveCsv(CssInformationExtractor(df)) + } + }), + "HtmlInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader + .loadArchives(inputFiles.head, sparkCtx.get) + .html() + inputFiles.tail foreach { f => + df = df.union( + RecordLoader.loadArchives(f, sparkCtx.get).html() + ) + } + if (!configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet") { + saveParquet(HtmlInformationExtractor(df)) + } else { + saveCsv(HtmlInformationExtractor(df)) + } + }), + "JsInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader + .loadArchives(inputFiles.head, sparkCtx.get) + .js() + inputFiles.tail foreach { f => + df = df.union( + RecordLoader.loadArchives(f, sparkCtx.get).js() + ) + } + if (!configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet") { + saveParquet(JsInformationExtractor(df)) + } else { + saveCsv(JsInformationExtractor(df)) + } + }), + "JsonInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader + .loadArchives(inputFiles.head, sparkCtx.get) + .json() + inputFiles.tail foreach { f => + df = df.union( + RecordLoader.loadArchives(f, sparkCtx.get).json() + ) + } + if (!configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet") { + saveParquet(JsonInformationExtractor(df)) + } else { + saveCsv(JsonInformationExtractor(df)) + } + }), + "PlainTextInformationExtractor" -> + ((inputFiles: List[String]) => { + var df = RecordLoader + .loadArchives(inputFiles.head, sparkCtx.get) + .plainText() + inputFiles.tail foreach { f => + df = df.union( + RecordLoader.loadArchives(f, sparkCtx.get).plainText() + ) + } + if (!configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet") { + saveParquet(PlainTextInformationExtractor(df)) + } else { + saveCsv(PlainTextInformationExtractor(df)) + } + }), + "XmlInformationExtractor" -> + ((inputFiles: 
List[String]) => { + var df = RecordLoader + .loadArchives(inputFiles.head, sparkCtx.get) + .xml() + inputFiles.tail foreach { f => + df = df.union( + RecordLoader.loadArchives(f, sparkCtx.get).xml() + ) + } + if (!configuration.outputFormat.isEmpty && configuration + .outputFormat() == "parquet") { + saveParquet(XmlInformationExtractor(df)) + } else { + saveCsv(XmlInformationExtractor(df)) + } }) ) diff --git a/src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala new file mode 100644 index 00000000..6d592428 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala @@ -0,0 +1,35 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import io.archivesunleashed.df.DataFrameLoader +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object CssInformationExtractor { + + /** Extract web graph from web archive using DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, src, image url, + * alt text) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + d + } +} diff --git a/src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala new file mode 100644 index 00000000..7c43daa8 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala @@ -0,0 +1,35 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import io.archivesunleashed.df.DataFrameLoader +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object HtmlInformationExtractor { + + /** Extract web graph from web archive using DataFrame and Spark SQL. 
+ * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, src, image url, + * alt text) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + d + } +} diff --git a/src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala new file mode 100644 index 00000000..0e976239 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala @@ -0,0 +1,35 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import io.archivesunleashed.df.DataFrameLoader +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object JsInformationExtractor { + + /** Extract web graph from web archive using DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, src, image url, + * alt text) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + d + } +} diff --git a/src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala new file mode 100644 index 00000000..998a55be --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala @@ -0,0 +1,35 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import io.archivesunleashed.df.DataFrameLoader +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object JsonInformationExtractor { + + /** Extract web graph from web archive using DataFrame and Spark SQL. 
+ * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, src, image url, + * alt text) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + d + } +} diff --git a/src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala new file mode 100644 index 00000000..f4495204 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala @@ -0,0 +1,35 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import io.archivesunleashed.df.DataFrameLoader +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object PlainTextInformationExtractor { + + /** Extract web graph from web archive using DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, src, image url, + * alt text) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + d + } +} diff --git a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala index 113efff9..216c93ba 100644 --- a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala @@ -38,14 +38,6 @@ object WebPagesExtractor { // scalastyle:off import spark.implicits._ // scalastyle:on - d.select( - $"crawl_date", - $"domain", - $"url", - $"mime_type_web_server", - $"mime_type_tika", - $"language", - $"content" - ) + d } } diff --git a/src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala new file mode 100644 index 00000000..79ad75a2 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala @@ -0,0 +1,35 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import io.archivesunleashed.ArchiveRecord +import io.archivesunleashed.df.DataFrameLoader +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} + +object XmlInformationExtractor { + + /** Extract web graph from web archive using DataFrame and Spark SQL. + * + * @param d DataFrame obtained from RecordLoader + * @return Dataset[Row], where the schema is (crawl date, src, image url, + * alt text) + */ + def apply(d: DataFrame): Dataset[Row] = { + val spark = SparkSession.builder().master("local").getOrCreate() + d + } +} diff --git a/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala b/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala index 384cf280..f051fe25 100644 --- a/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala +++ b/src/main/scala/io/archivesunleashed/df/DataFrameLoader.scala @@ -37,6 +37,20 @@ class DataFrameLoader(sc: SparkContext) { .audio() } + /* Create a DataFrame with css url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and content. */ + def css(path: String): DataFrame = { + RecordLoader + .loadArchives(path, sc) + .css() + } + + /* Create a DataFrame with html url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and content. */ + def html(path: String): DataFrame = { + RecordLoader + .loadArchives(path, sc) + .html() + } + /* Create a DataFrame with crawl date, source page, image url, and alt text. */ def imagegraph(path: String): DataFrame = { RecordLoader @@ -51,6 +65,20 @@ class DataFrameLoader(sc: SparkContext) { .images() } + /* Create a DataFrame with js url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and content. */ + def js(path: String): DataFrame = { + RecordLoader + .loadArchives(path, sc) + .js() + } + + /* Create a DataFrame with json url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and content. */ + def json(path: String): DataFrame = { + RecordLoader + .loadArchives(path, sc) + .json() + } + /** Create a DataFrame with PDF url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */ def pdfs(path: String): DataFrame = { RecordLoader @@ -58,6 +86,13 @@ class DataFrameLoader(sc: SparkContext) { .pdfs() } + /* Create a DataFrame with plain-text url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and content. */ + def plainText(path: String): DataFrame = { + RecordLoader + .loadArchives(path, sc) + .plainText() + } + /** Create a DataFrame with presentation program file url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and raw bytes. */ def presentationProgramFiles(path: String): DataFrame = { RecordLoader @@ -99,4 +134,11 @@ class DataFrameLoader(sc: SparkContext) { .loadArchives(path, sc) .wordProcessorFiles() } + + /* Create a DataFrame with xml url, filename, extension, mime_type_web_server, mime_type_tika, md5, sha1, and content. */ + def xml(path: String): DataFrame = { + RecordLoader + .loadArchives(path, sc) + .xml() + } } diff --git a/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala b/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala index 06c47365..dc2c4f57 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/DetectMimeTypeTika.scala @@ -54,8 +54,12 @@ object DetectMimeTypeTika { * @return file extension (e.g. ".jpg" for "image/jpeg"). 
*/ def getExtension(mimeType: String): String = { - val regMimeType = allMimeTypes.forName(mimeType) - regMimeType.getExtension + try { + val regMimeType = allMimeTypes.forName(mimeType) + regMimeType.getExtension + } catch { + case e: Throwable => "" + } } /** Return the list of all known file extensions for a MIME type string @@ -64,8 +68,11 @@ object DetectMimeTypeTika { * @return list of file extensions (e.g. ".jpg" for "image/jpeg"). */ def getExtensions(mimeType: String): List[String] = { - val regMimeType = allMimeTypes.forName(mimeType) - regMimeType.getExtensions.asScala.toList + try { + val regMimeType = allMimeTypes.forName(mimeType) + regMimeType.getExtensions.asScala.toList + } catch { + case e: Throwable => Nil + } } - } diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index 0dff6961..fec08399 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -694,6 +694,283 @@ package object archivesunleashed { sqlContext.getOrCreate().createDataFrame(records, schema) } + /* Extract css. */ + def css(): DataFrame = { + val records = rdd + .map(r => (r, (r.getMimeType))) + .filter(r => r._2 == "text/css") + .map(r => { + val bytes = r._1.getBinaryBytes + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) + val encodedBytes = Base64.getEncoder.encodeToString(bytes) + val url = new URL(r._1.getUrl) + val filename = FilenameUtils.getName(url.getPath()) + val extension = GetExtensionMIME(url.getPath(), r._2) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + RemoveHTTPHeader(r._1.getContentString) + ) + }) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + + val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) + .add(StructField("url", StringType, true)) + .add(StructField("filename", StringType, true)) + .add(StructField("extension", StringType, true)) + .add(StructField("mime_type_web_server", StringType, true)) + .add(StructField("mime_type_tika", StringType, true)) + .add(StructField("md5", StringType, true)) + .add(StructField("sha1", StringType, true)) + .add(StructField("content", StringType, true)) + + val sqlContext = SparkSession.builder(); + sqlContext.getOrCreate().createDataFrame(records, schema) + } + + /* Extract html. 
*/ + def html(): DataFrame = { + val records = rdd + .map(r => (r, (r.getMimeType))) + .filter(r => r._2 == "text/html") + .map(r => { + val bytes = r._1.getBinaryBytes + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) + val encodedBytes = Base64.getEncoder.encodeToString(bytes) + val url = new URL(r._1.getUrl) + val filename = FilenameUtils.getName(url.getPath()) + val extension = GetExtensionMIME(url.getPath(), r._2) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + RemoveHTTPHeader(r._1.getContentString) + ) + }) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + + val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) + .add(StructField("url", StringType, true)) + .add(StructField("filename", StringType, true)) + .add(StructField("extension", StringType, true)) + .add(StructField("mime_type_web_server", StringType, true)) + .add(StructField("mime_type_tika", StringType, true)) + .add(StructField("md5", StringType, true)) + .add(StructField("sha1", StringType, true)) + .add(StructField("content", StringType, true)) + + val sqlContext = SparkSession.builder(); + sqlContext.getOrCreate().createDataFrame(records, schema) + } + + /* Extract javascript. */ + def js(): DataFrame = { + val records = rdd + .map(r => (r, (r.getMimeType))) + .filter(r => r._2.contains("javascript")) + .map(r => { + val bytes = r._1.getBinaryBytes + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) + val encodedBytes = Base64.getEncoder.encodeToString(bytes) + val url = new URL(r._1.getUrl) + val filename = FilenameUtils.getName(url.getPath()) + val extension = GetExtensionMIME(url.getPath(), r._2) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + RemoveHTTPHeader(r._1.getContentString) + ) + }) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + + val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) + .add(StructField("url", StringType, true)) + .add(StructField("filename", StringType, true)) + .add(StructField("extension", StringType, true)) + .add(StructField("mime_type_web_server", StringType, true)) + .add(StructField("mime_type_tika", StringType, true)) + .add(StructField("md5", StringType, true)) + .add(StructField("sha1", StringType, true)) + .add(StructField("content", StringType, true)) + + val sqlContext = SparkSession.builder(); + sqlContext.getOrCreate().createDataFrame(records, schema) + } + + /* Extract json. 
*/ + def json(): DataFrame = { + val records = rdd + .map(r => (r, (r.getMimeType))) + .filter(r => r._2.contains("json")) + .map(r => { + val bytes = r._1.getBinaryBytes + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) + val encodedBytes = Base64.getEncoder.encodeToString(bytes) + val url = new URL(r._1.getUrl) + val filename = FilenameUtils.getName(url.getPath()) + val extension = GetExtensionMIME(url.getPath(), r._2) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + RemoveHTTPHeader(r._1.getContentString) + ) + }) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + + val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) + .add(StructField("url", StringType, true)) + .add(StructField("filename", StringType, true)) + .add(StructField("extension", StringType, true)) + .add(StructField("mime_type_web_server", StringType, true)) + .add(StructField("mime_type_tika", StringType, true)) + .add(StructField("md5", StringType, true)) + .add(StructField("sha1", StringType, true)) + .add(StructField("content", StringType, true)) + + val sqlContext = SparkSession.builder(); + sqlContext.getOrCreate().createDataFrame(records, schema) + } + + /* Extract plain text. */ + def plainText(): DataFrame = { + val records = rdd + .map(r => (r, (r.getMimeType), (DetectMimeTypeTika(r.getBinaryBytes)))) + .filter(r => r._2 == "text/plain") + .filter(r => r._3 == "text/plain") + .map(r => { + val bytes = r._1.getBinaryBytes + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) + val encodedBytes = Base64.getEncoder.encodeToString(bytes) + val url = new URL(r._1.getUrl) + val filename = FilenameUtils.getName(url.getPath()) + val extension = GetExtensionMIME(url.getPath(), r._2) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + RemoveHTTPHeader(r._1.getContentString) + ) + }) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + + val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) + .add(StructField("url", StringType, true)) + .add(StructField("filename", StringType, true)) + .add(StructField("extension", StringType, true)) + .add(StructField("mime_type_web_server", StringType, true)) + .add(StructField("mime_type_tika", StringType, true)) + .add(StructField("md5", StringType, true)) + .add(StructField("sha1", StringType, true)) + .add(StructField("content", StringType, true)) + + val sqlContext = SparkSession.builder(); + sqlContext.getOrCreate().createDataFrame(records, schema) + } + + /* Extract xml. 
*/ + def xml(): DataFrame = { + val records = rdd + .map(r => (r, (r.getMimeType))) + .filter(r => r._2.contains("xml")) + .map(r => { + val bytes = r._1.getBinaryBytes + val md5Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("MD5").digest(bytes)) + ) + val sha1Hash = new String( + Hex.encodeHex(MessageDigest.getInstance("SHA1").digest(bytes)) + ) + val encodedBytes = Base64.getEncoder.encodeToString(bytes) + val url = new URL(r._1.getUrl) + val filename = FilenameUtils.getName(url.getPath()) + val extension = GetExtensionMIME(url.getPath(), r._2) + ( + r._1.getCrawlDate, + r._1.getUrl, + filename, + extension, + r._1.getMimeType, + DetectMimeTypeTika(r._1.getBinaryBytes), + md5Hash, + sha1Hash, + RemoveHTTPHeader(r._1.getContentString) + ) + }) + .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + + val schema = new StructType() + .add(StructField("crawl_date", StringType, true)) + .add(StructField("url", StringType, true)) + .add(StructField("filename", StringType, true)) + .add(StructField("extension", StringType, true)) + .add(StructField("mime_type_web_server", StringType, true)) + .add(StructField("mime_type_tika", StringType, true)) + .add(StructField("md5", StringType, true)) + .add(StructField("sha1", StringType, true)) + .add(StructField("content", StringType, true)) + + val sqlContext = SparkSession.builder(); + sqlContext.getOrCreate().createDataFrame(records, schema) + } + /** Removes all data except images. */ def keepImages(): RDD[ArchiveRecord] = { rdd.filter(r => diff --git a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala index 2e6fba90..38629c75 100644 --- a/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala +++ b/src/test/scala/io/archivesunleashed/app/CommandLineAppTest.scala @@ -743,6 +743,330 @@ class CommandLineAppTest extends FunSuite with BeforeAndAfter { "parquet", "--partition", "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "CssInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "CssInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "CssInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "CssInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "CssInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "HtmlInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "HtmlInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "HtmlInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "HtmlInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "HtmlInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsInformationExtractor" + ), + Array( + inputOpt, + 
arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsonInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsonInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsonInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsonInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "JsonInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PlainTextInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PlainTextInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PlainTextInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PlainTextInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "PlainTextInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "XmlInformationExtractor" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "XmlInformationExtractor", + "--split" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "XmlInformationExtractor", + "--partition", + "1" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "XmlInformationExtractor", + "--output-format", + "parquet" + ), + Array( + inputOpt, + arcPath, + warcPath, + outputOpt, + outputDir, + extractOpt, + "XmlInformationExtractor", + "--output-format", + "parquet", + "--partition", + "1" ) ) diff --git a/src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala new file mode 100644 index 00000000..1c0caee2 --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala @@ -0,0 +1,64 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class CssInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/example.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("CSS information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).css() + val dfResults = CssInformationExtractor(df).collect() + val RESULTSLENGTH = 4 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20080430204833") + assert( + dfResults(0).get(1) == "http://www.archive.org/stylesheets/details.css" + ) + assert(dfResults(0).get(2) == "details.css") + assert(dfResults(0).get(3) == "css") + assert(dfResults(0).get(4) == "text/css") + assert(dfResults(0).get(5) == "text/plain") + assert(dfResults(0).get(6) == "f675020391de85d915a5ec65eb52e1c9") + assert(dfResults(0).get(7) == "2961a59b8fc20f401e1927dd0b63e5ae6e833f7a") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala new file mode 100644 index 00000000..a0810fc0 --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala @@ -0,0 +1,62 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class HtmlInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/example.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("HTML information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).html() + val dfResults = HtmlInformationExtractor(df).collect() + val RESULTSLENGTH = 140 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20080430204826") + assert(dfResults(0).get(1) == "http://www.archive.org/") + assert(dfResults(0).get(2) == "") + assert(dfResults(0).get(3) == "html") + assert(dfResults(0).get(4) == "text/html") + assert(dfResults(0).get(5) == "text/html") + assert(dfResults(0).get(6) == "990fc5f1674fd21b9a035cf9193c3f10") + assert(dfResults(0).get(7) == "d5817bf5b4b35a296823509dd754700a6ad522b5") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala new file mode 100644 index 00000000..cf72804c --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala @@ -0,0 +1,62 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class JsInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/example.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("JS information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).js() + val dfResults = JsInformationExtractor(df).collect() + val RESULTSLENGTH = 8 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20080430204833") + assert(dfResults(0).get(1) == "http://www.archive.org/flv/flv.js?v=1.34") + assert(dfResults(0).get(2) == "flv.js") + assert(dfResults(0).get(3) == "js") + assert(dfResults(0).get(4) == "application/x-javascript") + assert(dfResults(0).get(5) == "text/plain") + assert(dfResults(0).get(6) == "8c73985a47e0d3720765d92fbde8cc9f") + assert(dfResults(0).get(7) == "83a0951127abb1da11b141ad22ac72c20f2b4804") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala b/src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala new file mode 100644 index 00000000..5db998c4 --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala @@ -0,0 +1,66 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class JsonInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = + Resources.getResource("warc/example.pdf.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("JSON information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).json() + val dfResults = JsonInformationExtractor(df).collect() + val RESULTSLENGTH = 3 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20190812222538") + assert( + dfResults(0) + .get(1) == "https://api.plu.mx/widget/other/artifact?type=doi&id=10.1109%2FJCDL.2019.00043&href=https%3A%2F%2Fplu.mx%2Fpitt%2Fa%2F%3Fdoi%3D10.1109%2FJCDL.2019.00043&ref=https%3A%2F%2Fyorkspace.library.yorku.ca%2Fxmlui%2Fhandle%2F10315%2F36158&pageToken=f74d46f3-f622-c670-e1bc-bdc3-aa500a283693&isElsWidget=false" + ) + assert(dfResults(0).get(2) == "artifact") + assert(dfResults(0).get(3) == "json") + assert(dfResults(0).get(4) == "application/json") + assert(dfResults(0).get(5) == "N/A") + assert(dfResults(0).get(6) == "d41d8cd98f00b204e9800998ecf8427e") + assert(dfResults(0).get(7) == "da39a3ee5e6b4b0d3255bfef95601890afd80709") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala b/src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala new file mode 100644 index 00000000..d1b62e01 --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala @@ -0,0 +1,62 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class PlainTextInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/example.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("Plain text information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).plainText() + val dfResults = PlainTextInformationExtractor(df).collect() + val RESULTSLENGTH = 34 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20080430204825") + assert(dfResults(0).get(1) == "http://www.archive.org/robots.txt") + assert(dfResults(0).get(2) == "robots.txt") + assert(dfResults(0).get(3) == "txt") + assert(dfResults(0).get(4) == "text/plain") + assert(dfResults(0).get(5) == "text/plain") + assert(dfResults(0).get(6) == "a6d6869f680b1bdd0d27bf5a5f49482e") + assert(dfResults(0).get(7) == "95046652b71aaa1e8a5a6af91e24016dfeae7bd4") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala b/src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala new file mode 100644 index 00000000..3662ccf6 --- /dev/null +++ b/src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala @@ -0,0 +1,65 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.archivesunleashed.app + +import com.google.common.io.Resources +import io.archivesunleashed.RecordLoader +import org.apache.spark.{SparkConf, SparkContext} +import org.junit.runner.RunWith +import org.scalatest.junit.JUnitRunner +import org.scalatest.{BeforeAndAfter, FunSuite} + +@RunWith(classOf[JUnitRunner]) +class XmlInformationExtractorTest extends FunSuite with BeforeAndAfter { + private val arcPath = Resources.getResource("warc/example.warc.gz").getPath + private var sc: SparkContext = _ + private val master = "local[4]" + private val appName = "example-spark" + + before { + val conf = new SparkConf() + .setMaster(master) + .setAppName(appName) + conf.set("spark.driver.allowMultipleContexts", "true") + sc = new SparkContext(conf) + } + + test("XML information extractor DF") { + val df = RecordLoader.loadArchives(arcPath, sc).xml() + val dfResults = XmlInformationExtractor(df).collect() + val RESULTSLENGTH = 9 + + assert(dfResults.length == RESULTSLENGTH) + assert(dfResults(0).get(0) == "20080430204830") + assert( + dfResults(0) + .get(1) == "http://www.archive.org/services/collection-rss.php" + ) + assert(dfResults(0).get(2) == "collection-rss.php") + assert(dfResults(0).get(3) == "xml") + assert(dfResults(0).get(4) == "text/xml") + assert(dfResults(0).get(5) == "application/rss+xml") + assert(dfResults(0).get(6) == "647a665e6acc2141af6d377b02e16c99") + assert(dfResults(0).get(7) == "4dee969d37e188ce705c6b99b8a6ca62aa1418e5") + } + + after { + if (sc != null) { + sc.stop() + } + } +} diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala index d021006d..fb8f3de7 100644 --- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala @@ -25,6 +25,7 @@ import org.scalatest.{BeforeAndAfter, FunSuite} @RunWith(classOf[JUnitRunner]) class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { private val arcPath = Resources.getResource("arc/example.arc.gz").getPath + private val warcPath = Resources.getResource("warc/example.warc.gz").getPath private val mediaPath = Resources.getResource("warc/example.media.warc.gz").getPath private val docPath = @@ -40,6 +41,7 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { private val url = "url" private val mime_type = "mime_type_web_server" private val md5 = "md5" + private val crawl_date = "crawl_date" before { val conf = new SparkConf() @@ -61,6 +63,12 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { val powerpoint = df.presentationProgramFiles(docPath) val word = df.wordProcessorFiles(docPath) val all = df.all(arcPath) + val css = df.css(warcPath) + val html = df.html(warcPath) + val js = df.js(warcPath) + val json = df.json(pdfPath) + val pt = df.plainText(warcPath) + val xml = df.xml(warcPath) val r_1 = validPages.select(domain, url, mime_type).take(1)(0) assert(r_1.getAs[String](domain) == "archive.org") @@ -129,6 +137,44 @@ class DataFrameLoaderTest extends FunSuite with BeforeAndAfter { r_11.getAs[String](url) == "http://www.archive.org/robots.txt" ) assert(r_11.getAs[String](mime_type) == "text/plain") + + val r_12 = css.select(crawl_date, url).take(1)(0) + assert(r_12.getAs[String](crawl_date) == "20080430204833") + assert( + r_12 + .getAs[String](url) == "http://www.archive.org/stylesheets/details.css" + ) + + val r_13 = html.select(crawl_date, url).take(1)(0) + assert(r_13.getAs[String](crawl_date) == 
"20080430204826") + assert( + r_13.getAs[String](url) == "http://www.archive.org/" + ) + + val r_14 = js.select(crawl_date, url).take(1)(0) + assert(r_14.getAs[String](crawl_date) == "20080430204833") + assert( + r_14.getAs[String](url) == "http://www.archive.org/flv/flv.js?v=1.34" + ) + + val r_15 = json.select(crawl_date, url).take(1)(0) + assert(r_15.getAs[String](crawl_date) == "20190812222538") + assert( + r_15.getAs[String](url) == "https://api.plu.mx/widget/other/artifact?type=doi&id=10.1109%2FJCDL.2019.00043&href=https%3A%2F%2Fplu.mx%2Fpitt%2Fa%2F%3Fdoi%3D10.1109%2FJCDL.2019.00043&ref=https%3A%2F%2Fyorkspace.library.yorku.ca%2Fxmlui%2Fhandle%2F10315%2F36158&pageToken=f74d46f3-f622-c670-e1bc-bdc3-aa500a283693&isElsWidget=false" + ) + + val r_16 = pt.select(crawl_date, url).take(1)(0) + assert(r_16.getAs[String](crawl_date) == "20080430204825") + assert( + r_16.getAs[String](url) == "http://www.archive.org/robots.txt" + ) + + val r_17 = xml.select(crawl_date, url).take(1)(0) + assert(r_17.getAs[String](crawl_date) == "20080430204830") + assert( + r_17.getAs[String](url) == "http://www.archive.org/services/collection-rss.php" + ) + } after { From 1a221bbe9414062f7a5ad3a01cae990fe4b79203 Mon Sep 17 00:00:00 2001 From: nruest Date: Fri, 17 Jun 2022 10:50:48 -0400 Subject: [PATCH 04/20] [maven-release-plugin] prepare release aut-1.1.0 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 00fc0977..33525b8a 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ io.archivesunleashed aut jar - 1.0.1-SNAPSHOT + 1.1.0 Archives Unleashed Toolkit An open-source toolkit for analyzing web archives. https://github.com/archivesunleashed/aut @@ -59,7 +59,7 @@ scm:git:git@github.com:archivesunleashed/aut.git scm:git:git@github.com:archivesunleashed/aut.git git@github.com:archivesunleashed/aut.git - HEAD + aut-1.1.0 From ba660da5311d2d73c0ddbcf0a6bee98c6a4f7f28 Mon Sep 17 00:00:00 2001 From: nruest Date: Fri, 17 Jun 2022 10:50:48 -0400 Subject: [PATCH 05/20] [maven-release-plugin] prepare for next development iteration --- README.md | 4 ++-- pom.xml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f5dd53a4..080a00cd 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # The Archives Unleashed Toolkit [![codecov](https://codecov.io/gh/archivesunleashed/aut/branch/main/graph/badge.svg)](https://codecov.io/gh/archivesunleashed/aut) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut) -[![Scaladoc](https://img.shields.io/badge/Scaladoc-1.0.0-blue?style=flat)](https://api.docs.archivesunleashed.io/1.0.0/scaladocs/io/archivesunleashed/index.html) -[![UserDocs](https://img.shields.io/badge/UserDocs-1.0.0-blue?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) +[![Scaladoc](https://img.shields.io/badge/Scaladoc-1.1.0-blue?style=flat)](https://api.docs.archivesunleashed.io/1.1.0/scaladocs/io/archivesunleashed/index.html) +[![UserDocs](https://img.shields.io/badge/UserDocs-1.1.0-blue?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) [![LICENSE](https://img.shields.io/badge/license-Apache-blue.svg?style=flat)](https://www.apache.org/licenses/LICENSE-2.0) [![Contribution Guidelines](http://img.shields.io/badge/CONTRIBUTING-Guidelines-blue.svg)](./CONTRIBUTING.md) diff --git a/pom.xml b/pom.xml index 33525b8a..c0b697be 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 
@@ io.archivesunleashed aut jar - 1.1.0 + 1.1.1-SNAPSHOT Archives Unleashed Toolkit An open-source toolkit for analyzing web archives. https://github.com/archivesunleashed/aut @@ -59,7 +59,7 @@ scm:git:git@github.com:archivesunleashed/aut.git scm:git:git@github.com:archivesunleashed/aut.git git@github.com:archivesunleashed/aut.git - aut-1.1.0 + HEAD From 0c5eb6ce03a7bfe36f175cd038f9ed6e55bd0e80 Mon Sep 17 00:00:00 2001 From: nruest Date: Fri, 17 Jun 2022 11:43:24 -0400 Subject: [PATCH 06/20] Update CHANGELOG for 1.1.0 release. --- CHANGELOG.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 863c2a22..8faea86f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## [aut-1.1.0](https://github.com/archivesunleashed/aut/tree/aut-1.1.0) (2022-06-17) + +[Full Changelog](https://github.com/archivesunleashed/aut/compare/aut-1.0.0...aut-1.1.0) + +**Fixed bugs:** + +- org.apache.tika.mime.MimeTypeException: Invalid media type name: application/rss+xml lang=utf-8 [\#542](https://github.com/archivesunleashed/aut/issues/542) + +**Closed issues:** + +- Add ARCH text files derivatives [\#540](https://github.com/archivesunleashed/aut/issues/540) + +**Merged pull requests:** + +- Add ARCH text files derivatives. [\#541](https://github.com/archivesunleashed/aut/pull/541) ([ruebot](https://github.com/ruebot)) + ## [aut-1.0.0](https://github.com/archivesunleashed/aut/tree/aut-1.0.0) (2022-06-10) [Full Changelog](https://github.com/archivesunleashed/aut/compare/aut-0.91.0...aut-1.0.0) From 8a4bf5460969f9133db1e95cb3b4969df3025a59 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 28 Sep 2022 15:08:44 -0400 Subject: [PATCH 07/20] Bump jsoup from 1.14.2 to 1.15.3 (#543) Bumps [jsoup](https://github.com/jhy/jsoup) from 1.14.2 to 1.15.3. - [Release notes](https://github.com/jhy/jsoup/releases) - [Changelog](https://github.com/jhy/jsoup/blob/master/CHANGES) - [Commits](https://github.com/jhy/jsoup/compare/jsoup-1.14.2...jsoup-1.15.3) --- updated-dependencies: - dependency-name: org.jsoup:jsoup dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index c0b697be..dc77bc28 100644 --- a/pom.xml +++ b/pom.xml @@ -472,7 +472,7 @@ org.jsoup jsoup - 1.14.2 + 1.15.3 org.netpreserve.commons From 86d926fa55d443f6e8204e45341492e982492058 Mon Sep 17 00:00:00 2001 From: Nick Ruest Date: Mon, 31 Oct 2022 13:35:23 -0400 Subject: [PATCH 08/20] Use YYYYMMDD for crawl_date for DomainGraphExtractor. (#545) - Resolves #544 - Update DomainGraphExtractor test - Add import scala.language.postfixOps to resolve "postfix operator isNotNull should be enabled by making the implicit value scala.language.postfixOps visible." 
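
For reference, a minimal spark-shell sketch of the changed behaviour (the WARC path below is a placeholder):

    import io.archivesunleashed._
    import io.archivesunleashed.app._

    // Load a web graph and extract the domain graph; the crawl_date column in
    // the output is now truncated to yyyyMMdd (e.g. "20080430") rather than
    // the full yyyyMMddHHmmss timestamp.
    val webgraph = RecordLoader.loadArchives("/path/to/warcs", sc).webgraph()
    DomainGraphExtractor(webgraph).show(5)
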
--- .../archivesunleashed/app/DomainGraphExtractor.scala | 4 ++-- .../archivesunleashed/app/PlainTextExtractor.scala | 1 + .../app/DomainGraphExtractorTest.scala | 12 ++++++------ 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala b/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala index ccbf4a8d..c1e2800e 100644 --- a/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala @@ -19,7 +19,7 @@ package io.archivesunleashed.app import io.archivesunleashed.ArchiveRecord import io.archivesunleashed.df.DataFrameLoader import io.archivesunleashed.udfs.{extractDomain, removePrefixWWW} -import org.apache.spark.sql.functions.desc +import org.apache.spark.sql.functions.{desc, substring} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object DomainGraphExtractor { @@ -35,7 +35,7 @@ object DomainGraphExtractor { import spark.implicits._ // scalastyle:on d.groupBy( - $"crawl_date", + substring($"crawl_date", 0, 8).as("crawl_date"), removePrefixWWW(extractDomain($"src")).as("src_domain"), removePrefixWWW(extractDomain($"dest")).as("dest_domain") ) diff --git a/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala b/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala index e8deabc4..3fe6a7dc 100644 --- a/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala @@ -20,6 +20,7 @@ import io.archivesunleashed.ArchiveRecord import io.archivesunleashed.udfs.{extractBoilerpipeText} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.functions.lower +import scala.language.postfixOps object PlainTextExtractor { diff --git a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala index 067101db..a5319637 100644 --- a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala @@ -39,21 +39,21 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter { } test("Domain graph extractor DF") { - val TESTLENGTH = 82 + val TESTLENGTH = 10 val df = RecordLoader.loadArchives(arcPath, sc).webgraph() val dfResult = DomainGraphExtractor(df).collect() assert(dfResult.length == TESTLENGTH) - assert(dfResult(0).get(0) == "20080430205151") + assert(dfResult(0).get(0) == "20080430") assert(dfResult(0).get(1) == "archive.org") assert(dfResult(0).get(2) == "archive.org") - assert(dfResult(0).get(3) == 10566) + assert(dfResult(0).get(3) == 37511) - assert(dfResult(1).get(0) == "20080430204948") + assert(dfResult(1).get(0) == "20080430") assert(dfResult(1).get(1) == "archive.org") - assert(dfResult(1).get(2) == "archive.org") - assert(dfResult(1).get(3) == 7143) + assert(dfResult(1).get(2) == "etree.org") + assert(dfResult(1).get(3) == 31) } after { From 5468f219f18dda1fb032b588e9171ed9affa43cb Mon Sep 17 00:00:00 2001 From: nruest Date: Mon, 31 Oct 2022 13:51:54 -0400 Subject: [PATCH 09/20] [maven-release-plugin] prepare release aut-1.1.1 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index dc77bc28..6a009090 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ io.archivesunleashed aut jar - 1.1.1-SNAPSHOT + 1.1.1 Archives Unleashed Toolkit An open-source 
toolkit for analyzing web archives. https://github.com/archivesunleashed/aut @@ -59,7 +59,7 @@ scm:git:git@github.com:archivesunleashed/aut.git scm:git:git@github.com:archivesunleashed/aut.git git@github.com:archivesunleashed/aut.git - HEAD + aut-1.1.1 From c1e63ba0f9c6a66ab957c7d7b6e3625496eb85a1 Mon Sep 17 00:00:00 2001 From: nruest Date: Mon, 31 Oct 2022 13:51:55 -0400 Subject: [PATCH 10/20] [maven-release-plugin] prepare for next development iteration --- README.md | 4 ++-- pom.xml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 080a00cd..1e2df257 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # The Archives Unleashed Toolkit [![codecov](https://codecov.io/gh/archivesunleashed/aut/branch/main/graph/badge.svg)](https://codecov.io/gh/archivesunleashed/aut) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut) -[![Scaladoc](https://img.shields.io/badge/Scaladoc-1.1.0-blue?style=flat)](https://api.docs.archivesunleashed.io/1.1.0/scaladocs/io/archivesunleashed/index.html) -[![UserDocs](https://img.shields.io/badge/UserDocs-1.1.0-blue?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) +[![Scaladoc](https://img.shields.io/badge/Scaladoc-1.1.1-blue?style=flat)](https://api.docs.archivesunleashed.io/1.1.1/scaladocs/io/archivesunleashed/index.html) +[![UserDocs](https://img.shields.io/badge/UserDocs-1.1.1-blue?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) [![LICENSE](https://img.shields.io/badge/license-Apache-blue.svg?style=flat)](https://www.apache.org/licenses/LICENSE-2.0) [![Contribution Guidelines](http://img.shields.io/badge/CONTRIBUTING-Guidelines-blue.svg)](./CONTRIBUTING.md) diff --git a/pom.xml b/pom.xml index 6a009090..2d616bcc 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ io.archivesunleashed aut jar - 1.1.1 + 1.1.2-SNAPSHOT Archives Unleashed Toolkit An open-source toolkit for analyzing web archives. https://github.com/archivesunleashed/aut @@ -59,7 +59,7 @@ scm:git:git@github.com:archivesunleashed/aut.git scm:git:git@github.com:archivesunleashed/aut.git git@github.com:archivesunleashed/aut.git - aut-1.1.1 + HEAD From eeaa464fa82f4d51e7d609653c26073371eaae47 Mon Sep 17 00:00:00 2001 From: nruest Date: Mon, 31 Oct 2022 14:41:03 -0400 Subject: [PATCH 11/20] Update CHANGELOG for 1.1.1 release. --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8faea86f..bbcf3a08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [aut-1.1.1](https://github.com/archivesunleashed/aut/tree/aut-1.1.1) (2022-10-31) + +[Full Changelog](https://github.com/archivesunleashed/aut/compare/aut-1.1.0...aut-1.1.1) + +**Fixed bugs:** + +- DomainGraph should use YYYYMMDD not YYYYMMDDHHMMSS [\#544](https://github.com/archivesunleashed/aut/issues/544) + +**Merged pull requests:** + +- Use YYYYMMDD for crawl\_date for DomainGraphExtractor. 
[\#545](https://github.com/archivesunleashed/aut/pull/545) ([ruebot](https://github.com/ruebot)) +- Bump jsoup from 1.14.2 to 1.15.3 [\#543](https://github.com/archivesunleashed/aut/pull/543) ([dependabot[bot]](https://github.com/apps/dependabot)) + ## [aut-1.1.0](https://github.com/archivesunleashed/aut/tree/aut-1.1.0) (2022-06-17) [Full Changelog](https://github.com/archivesunleashed/aut/compare/aut-1.0.0...aut-1.1.0) From cdf8e769da0a6082bb1d98aa5007a773e70d0384 Mon Sep 17 00:00:00 2001 From: Nick Ruest Date: Mon, 7 Nov 2022 12:43:02 -0500 Subject: [PATCH 12/20] Implements extracting `last_modified_date` of a resource where available. (#547) * Adds `getLastModified` for `SparklingArchiveRecord` * Adds `CovertLastModifiedDate` to convert RFC 1123 dates to `yyyyMMddHHmmss` * See: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Last-Modified * Implement `last_modified_date` column for * `.all()` * `.webpages()` * `.images()` * `.pdfs()` * `.audio()` * `.videos()` * `.spreadsheets()` * `.presentationProgramFiles()` * `.wordProcessorFiles()` * `.css()` * `.html()` * `.js()` * `.json()` * `.plainText()` * `.xml()` * Update tests * Resolves #546 --- .../io/archivesunleashed/ArchiveRecord.scala | 3 + .../SparklingArchiveRecord.scala | 11 ++- .../app/AudioInformationExtractor.scala | 1 + .../app/ImageInformationExtractor.scala | 1 + .../app/PDFInformationExtractor.scala | 1 + ...sentationProgramInformationExtractor.scala | 1 + .../app/SpreadsheetInformationExtractor.scala | 1 + .../app/VideoInformationExtractor.scala | 1 + .../WordProcessorInformationExtractor.scala | 1 + .../matchbox/CovertLastModifiedDate.scala | 65 +++++++++++++++ .../scala/io/archivesunleashed/package.scala | 82 ++++++++++++++++--- .../io/archivesunleashed/RecordDFTest.scala | 2 +- .../app/AudioInformationExtractorTest.scala | 13 +-- .../app/CssInformationExtractorTest.scala | 15 ++-- .../app/HtmlInformationExtractorTest.scala | 13 +-- .../app/ImageInformationExtractorTest.scala | 17 ++-- .../app/JsInformationExtractorTest.scala | 15 ++-- .../app/JsonInfromationExtractor.scala | 15 ++-- .../app/PDFInformationExtractorTest.scala | 13 +-- .../app/PlainTextInformationExtractor.scala | 13 +-- ...ationProgramInformationExtractorTest.scala | 15 ++-- .../SpreadsheetInformationExtractorTest.scala | 15 ++-- .../app/VideoInformationExtractorTest.scala | 13 +-- .../app/WebPagesExtractorTest.scala | 9 +- ...ordProcessorInformationExtractorTest.scala | 13 +-- .../app/XmlInfromationExtractor.scala | 15 ++-- 26 files changed, 259 insertions(+), 105 deletions(-) create mode 100644 src/main/scala/io/archivesunleashed/matchbox/CovertLastModifiedDate.scala diff --git a/src/main/scala/io/archivesunleashed/ArchiveRecord.scala b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala index d26b2a7f..f474159c 100644 --- a/src/main/scala/io/archivesunleashed/ArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/ArchiveRecord.scala @@ -48,4 +48,7 @@ trait ArchiveRecord extends Serializable { /** Returns payload digest (SHA1). */ def getPayloadDigest: String + + /** Returns last-modified date from HTTPS headers as a string. 
*/ + def getLastModified: String } diff --git a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala index c96b09bd..5eb656c3 100644 --- a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala @@ -22,7 +22,11 @@ import io.archivesunleashed.matchbox.ExtractDomain import org.apache.tika.io.BoundedInputStream import org.archive.webservices.sparkling.http.HttpMessage import org.archive.webservices.sparkling.io.IOUtil -import org.archive.webservices.sparkling.util.{ManagedVal, ValueSupplier} +import org.archive.webservices.sparkling.util.{ + ManagedVal, + RegexUtil, + ValueSupplier +} import org.archive.webservices.sparkling.warc.{WarcHeaders, WarcRecord} import scala.util.Try @@ -60,6 +64,11 @@ class SparklingArchiveRecord( new SparklingArchiveRecord(filename, meta, payload, maxBodyLength) } + override def getLastModified: String = + http(warc) + .flatMap(_.headerMap.get("last-modified")) + .getOrElse("") + override def getArchiveFilename: String = filename override def getCrawlDate: String = diff --git a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala index 2d269082..93418897 100644 --- a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala @@ -35,6 +35,7 @@ object AudioInformationExtractor { // scalastyle:on d.select( $"crawl_date", + $"last_modified_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala index d4ab80fa..29149036 100644 --- a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala @@ -35,6 +35,7 @@ object ImageInformationExtractor { // scalastyle:on d.select( $"crawl_date", + $"last_modified_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala index 2d105c95..168b7fa3 100644 --- a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala @@ -35,6 +35,7 @@ object PDFInformationExtractor { // scalastyle:on d.select( $"crawl_date", + $"last_modified_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala index 0db5868c..60a4132f 100644 --- a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala @@ -35,6 +35,7 @@ object PresentationProgramInformationExtractor { // scalastyle:on d.select( $"crawl_date", + $"last_modified_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala index 1ca25ac7..69597d29 100644 --- a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala +++ 
b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala @@ -35,6 +35,7 @@ object SpreadsheetInformationExtractor { // scalastyle:on d.select( $"crawl_date", + $"last_modified_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala index f0839195..245d4618 100644 --- a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala @@ -35,6 +35,7 @@ object VideoInformationExtractor { // scalastyle:on d.select( $"crawl_date", + $"last_modified_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala index a8424741..914ca7e8 100644 --- a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala @@ -35,6 +35,7 @@ object WordProcessorInformationExtractor { // scalastyle:on d.select( $"crawl_date", + $"last_modified_date", $"url", $"filename", $"extension", diff --git a/src/main/scala/io/archivesunleashed/matchbox/CovertLastModifiedDate.scala b/src/main/scala/io/archivesunleashed/matchbox/CovertLastModifiedDate.scala new file mode 100644 index 00000000..1156cae6 --- /dev/null +++ b/src/main/scala/io/archivesunleashed/matchbox/CovertLastModifiedDate.scala @@ -0,0 +1,65 @@ +/* + * Copyright © 2017 The Archives Unleashed Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.archivesunleashed.matchbox + +/** Converts RFC 1123 dates to yyyyMMddHHmmss. */ +object CovertLastModifiedDate { + val months = Seq( + "jan", + "feb", + "mar", + "apr", + "may", + "jun", + "jul", + "aug", + "sep", + "oct", + "nov", + "dec" + ).zipWithIndex.map { case (s, d) => (s, ("0" + (d + 1)).takeRight(2)) } + + /** Converts last_modified_date to yyyyMMddHHmmss. + * + * @param lastModifiedDate date returned by `getLastModified`, formatted as RFC 1123 + * @return last_modified_date as yyyyMMddHHmmss. 
+ */ + def apply(lastModifiedDate: String): String = { + if (lastModifiedDate.isEmpty) { + "" + } else { + // Credit: Helge Holzmann (@helgeho) + // Adapted from https://github.com/archivesunleashed/aut/pull/547#issuecomment-1302094573 + val lc = lastModifiedDate.toLowerCase + val date = months.find(m => lc.contains(m._1)).map(_._2).flatMap { m => + val d = lc + .replace(":", "") + .split(' ') + .drop(1) + .map(d => (d.length, d)) + .toMap + for (y <- d.get(4); n <- d.get(2); t <- d.get(6)) + yield y + m + n + t + } + date match { + case Some(date) => + date + case None => + "" + } + } + } +} diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index fec08399..6935594f 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -21,6 +21,7 @@ import java.security.MessageDigest import java.util.Base64 import io.archivesunleashed.matchbox.{ + CovertLastModifiedDate, DetectLanguage, DetectMimeTypeTika, ExtractDate, @@ -170,6 +171,7 @@ package object archivesunleashed { .map(r => Row( r.getCrawlDate, + CovertLastModifiedDate(r.getLastModified), ExtractDomain(r.getUrl).replaceAll("^\\s*www\\.", ""), r.getUrl, r.getMimeType, @@ -183,6 +185,7 @@ package object archivesunleashed { val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("domain", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("mime_type_web_server", StringType, true)) @@ -225,6 +228,7 @@ package object archivesunleashed { .map(r => Row( r.getCrawlDate, + CovertLastModifiedDate(r.getLastModified), ExtractDomain(r.getUrl).replaceAll("^\\s*www\\.", ""), r.getUrl, r.getMimeType, @@ -236,6 +240,7 @@ package object archivesunleashed { val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("domain", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("mime_type_web_server", StringType, true)) @@ -304,6 +309,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), mimeTypeTika) ( r.getCrawlDate, + CovertLastModifiedDate(r.getLastModified), r.getUrl, filename, extension, @@ -328,12 +334,14 @@ package object archivesunleashed { t._8, t._9, t._10, - t._11 + t._11, + t._12 ) ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -368,6 +376,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -378,10 +387,13 @@ package object archivesunleashed { encodedBytes ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -414,6 +426,7 @@ package object archivesunleashed { val extension = 
GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -424,10 +437,13 @@ package object archivesunleashed { encodedBytes ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -460,6 +476,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -470,10 +487,13 @@ package object archivesunleashed { encodedBytes ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -541,6 +561,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), mimeType) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -551,10 +572,13 @@ package object archivesunleashed { encodedBytes ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -602,6 +626,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -612,10 +637,13 @@ package object archivesunleashed { encodedBytes ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -667,6 +695,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -677,10 +706,13 @@ package object archivesunleashed { encodedBytes ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -713,6 +745,7 
@@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -723,10 +756,13 @@ package object archivesunleashed { RemoveHTTPHeader(r._1.getContentString) ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -759,6 +795,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -769,10 +806,13 @@ package object archivesunleashed { RemoveHTTPHeader(r._1.getContentString) ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -805,6 +845,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -815,10 +856,13 @@ package object archivesunleashed { RemoveHTTPHeader(r._1.getContentString) ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -851,6 +895,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -861,10 +906,13 @@ package object archivesunleashed { RemoveHTTPHeader(r._1.getContentString) ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -898,6 +946,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -908,10 +957,13 @@ package object archivesunleashed { RemoveHTTPHeader(r._1.getContentString) ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + 
.add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) @@ -944,6 +996,7 @@ package object archivesunleashed { val extension = GetExtensionMIME(url.getPath(), r._2) ( r._1.getCrawlDate, + CovertLastModifiedDate(r._1.getLastModified), r._1.getUrl, filename, extension, @@ -954,10 +1007,13 @@ package object archivesunleashed { RemoveHTTPHeader(r._1.getContentString) ) }) - .map(t => Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9)) + .map(t => + Row(t._1, t._2, t._3, t._4, t._5, t._6, t._7, t._8, t._9, t._10) + ) val schema = new StructType() .add(StructField("crawl_date", StringType, true)) + .add(StructField("last_modified_date", StringType, true)) .add(StructField("url", StringType, true)) .add(StructField("filename", StringType, true)) .add(StructField("extension", StringType, true)) diff --git a/src/test/scala/io/archivesunleashed/RecordDFTest.scala b/src/test/scala/io/archivesunleashed/RecordDFTest.scala index 0648d597..b8961bfc 100644 --- a/src/test/scala/io/archivesunleashed/RecordDFTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordDFTest.scala @@ -56,7 +56,7 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { } test("Keep valid pages DF") { - val expected = "http://www.archive.org/" + val expected = "archive.org" val base = RecordLoader .loadArchives(arcPath, sc) .all() diff --git a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala index bd2b850a..4561f4bd 100644 --- a/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/AudioInformationExtractorTest.scala @@ -46,13 +46,14 @@ class AudioInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20190817230242") - assert(dfResults(0).get(1) == "https://ruebot.net/files/feniz.mp3") - assert(dfResults(0).get(2) == "feniz.mp3") - assert(dfResults(0).get(3) == "mp3") - assert(dfResults(0).get(4) == "audio/mpeg") + assert(dfResults(0).get(1) == "20111026005826") + assert(dfResults(0).get(2) == "https://ruebot.net/files/feniz.mp3") + assert(dfResults(0).get(3) == "feniz.mp3") + assert(dfResults(0).get(4) == "mp3") assert(dfResults(0).get(5) == "audio/mpeg") - assert(dfResults(0).get(6) == "f7e7ec84b12c294e19af1ba41732c733") - assert(dfResults(0).get(7) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a") + assert(dfResults(0).get(6) == "audio/mpeg") + assert(dfResults(0).get(7) == "f7e7ec84b12c294e19af1ba41732c733") + assert(dfResults(0).get(8) == "a3eb95dbbea76460529d0d9ebdde5faabaff544a") } after { diff --git a/src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala index 1c0caee2..fd11e829 100644 --- a/src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/CssInformationExtractorTest.scala @@ -45,15 +45,16 @@ class CssInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20080430204833") + assert(dfResults(0).get(1) == "20080422021044") assert( - dfResults(0).get(1) == "http://www.archive.org/stylesheets/details.css" + dfResults(0).get(2) == "http://www.archive.org/stylesheets/details.css" ) - 
assert(dfResults(0).get(2) == "details.css") - assert(dfResults(0).get(3) == "css") - assert(dfResults(0).get(4) == "text/css") - assert(dfResults(0).get(5) == "text/plain") - assert(dfResults(0).get(6) == "f675020391de85d915a5ec65eb52e1c9") - assert(dfResults(0).get(7) == "2961a59b8fc20f401e1927dd0b63e5ae6e833f7a") + assert(dfResults(0).get(3) == "details.css") + assert(dfResults(0).get(4) == "css") + assert(dfResults(0).get(5) == "text/css") + assert(dfResults(0).get(6) == "text/plain") + assert(dfResults(0).get(7) == "f675020391de85d915a5ec65eb52e1c9") + assert(dfResults(0).get(8) == "2961a59b8fc20f401e1927dd0b63e5ae6e833f7a") } after { diff --git a/src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala index a0810fc0..3c461e03 100644 --- a/src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/HtmlInformationExtractorTest.scala @@ -45,13 +45,14 @@ class HtmlInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20080430204826") - assert(dfResults(0).get(1) == "http://www.archive.org/") - assert(dfResults(0).get(2) == "") - assert(dfResults(0).get(3) == "html") - assert(dfResults(0).get(4) == "text/html") + assert(dfResults(0).get(1) == "20080109231829") + assert(dfResults(0).get(2) == "http://www.archive.org/") + assert(dfResults(0).get(3) == "") + assert(dfResults(0).get(4) == "html") assert(dfResults(0).get(5) == "text/html") - assert(dfResults(0).get(6) == "990fc5f1674fd21b9a035cf9193c3f10") - assert(dfResults(0).get(7) == "d5817bf5b4b35a296823509dd754700a6ad522b5") + assert(dfResults(0).get(6) == "text/html") + assert(dfResults(0).get(7) == "990fc5f1674fd21b9a035cf9193c3f10") + assert(dfResults(0).get(8) == "d5817bf5b4b35a296823509dd754700a6ad522b5") } after { diff --git a/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala index 727edff5..1ab3a501 100644 --- a/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/ImageInformationExtractorTest.scala @@ -45,15 +45,16 @@ class ImageInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20080430204829") - assert(dfResults(0).get(1) == "http://www.archive.org/images/logoc.jpg") - assert(dfResults(0).get(2) == "logoc.jpg") - assert(dfResults(0).get(3) == "jpg") - assert(dfResults(0).get(4) == "image/jpeg") + assert(dfResults(0).get(1) == "20030616222851") + assert(dfResults(0).get(2) == "http://www.archive.org/images/logoc.jpg") + assert(dfResults(0).get(3) == "logoc.jpg") + assert(dfResults(0).get(4) == "jpg") assert(dfResults(0).get(5) == "image/jpeg") - assert(dfResults(0).get(6) == 70) - assert(dfResults(0).get(7) == 56) - assert(dfResults(0).get(8) == "8211d1fbb9b03d8522a1ae378f9d1b24") - assert(dfResults(0).get(9) == "a671e68fc211ee4996a91e99297f246b2c5faa1a") + assert(dfResults(0).get(6) == "image/jpeg") + assert(dfResults(0).get(7) == 70) + assert(dfResults(0).get(8) == 56) + assert(dfResults(0).get(9) == "8211d1fbb9b03d8522a1ae378f9d1b24") + assert(dfResults(0).get(10) == "a671e68fc211ee4996a91e99297f246b2c5faa1a") } after { diff --git a/src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala 
b/src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala index cf72804c..886dec49 100644 --- a/src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/JsInformationExtractorTest.scala @@ -45,13 +45,14 @@ class JsInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20080430204833") - assert(dfResults(0).get(1) == "http://www.archive.org/flv/flv.js?v=1.34") - assert(dfResults(0).get(2) == "flv.js") - assert(dfResults(0).get(3) == "js") - assert(dfResults(0).get(4) == "application/x-javascript") - assert(dfResults(0).get(5) == "text/plain") - assert(dfResults(0).get(6) == "8c73985a47e0d3720765d92fbde8cc9f") - assert(dfResults(0).get(7) == "83a0951127abb1da11b141ad22ac72c20f2b4804") + assert(dfResults(0).get(1) == "20080430064607") + assert(dfResults(0).get(2) == "http://www.archive.org/flv/flv.js?v=1.34") + assert(dfResults(0).get(3) == "flv.js") + assert(dfResults(0).get(4) == "js") + assert(dfResults(0).get(5) == "application/x-javascript") + assert(dfResults(0).get(6) == "text/plain") + assert(dfResults(0).get(7) == "8c73985a47e0d3720765d92fbde8cc9f") + assert(dfResults(0).get(8) == "83a0951127abb1da11b141ad22ac72c20f2b4804") } after { diff --git a/src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala b/src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala index 5db998c4..869bbda1 100644 --- a/src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala +++ b/src/test/scala/io/archivesunleashed/app/JsonInfromationExtractor.scala @@ -46,16 +46,17 @@ class JsonInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20190812222538") + assert(dfResults(0).get(1) == "") assert( dfResults(0) - .get(1) == "https://api.plu.mx/widget/other/artifact?type=doi&id=10.1109%2FJCDL.2019.00043&href=https%3A%2F%2Fplu.mx%2Fpitt%2Fa%2F%3Fdoi%3D10.1109%2FJCDL.2019.00043&ref=https%3A%2F%2Fyorkspace.library.yorku.ca%2Fxmlui%2Fhandle%2F10315%2F36158&pageToken=f74d46f3-f622-c670-e1bc-bdc3-aa500a283693&isElsWidget=false" + .get(2) == "https://api.plu.mx/widget/other/artifact?type=doi&id=10.1109%2FJCDL.2019.00043&href=https%3A%2F%2Fplu.mx%2Fpitt%2Fa%2F%3Fdoi%3D10.1109%2FJCDL.2019.00043&ref=https%3A%2F%2Fyorkspace.library.yorku.ca%2Fxmlui%2Fhandle%2F10315%2F36158&pageToken=f74d46f3-f622-c670-e1bc-bdc3-aa500a283693&isElsWidget=false" ) - assert(dfResults(0).get(2) == "artifact") - assert(dfResults(0).get(3) == "json") - assert(dfResults(0).get(4) == "application/json") - assert(dfResults(0).get(5) == "N/A") - assert(dfResults(0).get(6) == "d41d8cd98f00b204e9800998ecf8427e") - assert(dfResults(0).get(7) == "da39a3ee5e6b4b0d3255bfef95601890afd80709") + assert(dfResults(0).get(3) == "artifact") + assert(dfResults(0).get(4) == "json") + assert(dfResults(0).get(5) == "application/json") + assert(dfResults(0).get(6) == "N/A") + assert(dfResults(0).get(7) == "d41d8cd98f00b204e9800998ecf8427e") + assert(dfResults(0).get(8) == "da39a3ee5e6b4b0d3255bfef95601890afd80709") } after { diff --git a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala index 2cc9af90..cdd71b99 100644 --- a/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/PDFInformationExtractorTest.scala @@ -46,17 
+46,18 @@ class PDFInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20190812222529") + assert(dfResults(0).get(1) == "20190626132632") assert( dfResults(0).get( - 1 + 2 ) == "https://yorkspace.library.yorku.ca/xmlui/bitstream/handle/10315/36158/cost-analysis.pdf?sequence=1&isAllowed=y" ) - assert(dfResults(0).get(2) == "cost-analysis.pdf") - assert(dfResults(0).get(3) == "pdf") - assert(dfResults(0).get(4) == "application/pdf") + assert(dfResults(0).get(3) == "cost-analysis.pdf") + assert(dfResults(0).get(4) == "pdf") assert(dfResults(0).get(5) == "application/pdf") - assert(dfResults(0).get(6) == "aaba59d2287afd40c996488a39bbc0dd") - assert(dfResults(0).get(7) == "569c28e0e8faa6945d6ca88fcd9e195825052c71") + assert(dfResults(0).get(6) == "application/pdf") + assert(dfResults(0).get(7) == "aaba59d2287afd40c996488a39bbc0dd") + assert(dfResults(0).get(8) == "569c28e0e8faa6945d6ca88fcd9e195825052c71") } after { diff --git a/src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala b/src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala index d1b62e01..06e2f670 100644 --- a/src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala +++ b/src/test/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala @@ -45,13 +45,14 @@ class PlainTextInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20080430204825") - assert(dfResults(0).get(1) == "http://www.archive.org/robots.txt") - assert(dfResults(0).get(2) == "robots.txt") - assert(dfResults(0).get(3) == "txt") - assert(dfResults(0).get(4) == "text/plain") + assert(dfResults(0).get(1) == "20080202194044") + assert(dfResults(0).get(2) == "http://www.archive.org/robots.txt") + assert(dfResults(0).get(3) == "robots.txt") + assert(dfResults(0).get(4) == "txt") assert(dfResults(0).get(5) == "text/plain") - assert(dfResults(0).get(6) == "a6d6869f680b1bdd0d27bf5a5f49482e") - assert(dfResults(0).get(7) == "95046652b71aaa1e8a5a6af91e24016dfeae7bd4") + assert(dfResults(0).get(6) == "text/plain") + assert(dfResults(0).get(7) == "a6d6869f680b1bdd0d27bf5a5f49482e") + assert(dfResults(0).get(8) == "95046652b71aaa1e8a5a6af91e24016dfeae7bd4") } after { diff --git a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala index 5f425875..2404429a 100644 --- a/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/PresentationProgramInformationExtractorTest.scala @@ -48,25 +48,26 @@ class PresentationProgramInformationExtractorTest assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20190815004338") + assert(dfResults(0).get(1) == "20190814234811") assert( dfResults(0).get( - 1 + 2 ) == "https://ruebot.net/files/aut-test-fixtures/aut-test-fixtures.pptx" ) - assert(dfResults(0).get(2) == "aut-test-fixtures.pptx") - assert(dfResults(0).get(3) == "pptx") + assert(dfResults(0).get(3) == "aut-test-fixtures.pptx") + assert(dfResults(0).get(4) == "pptx") assert( dfResults(0).get( - 4 + 5 ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation" ) assert( dfResults(0).get( - 5 + 6 ) == "application/vnd.openxmlformats-officedocument.presentationml.presentation" ) - assert(dfResults(0).get(6) == 
"7a7b1fe4b6d311376eaced9de3b682ee") - assert(dfResults(0).get(7) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec") + assert(dfResults(0).get(7) == "7a7b1fe4b6d311376eaced9de3b682ee") + assert(dfResults(0).get(8) == "86fadca47b134b68247ccde62da4ce3f62b4d2ec") } after { diff --git a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala index ecedc8b7..de5b2499 100644 --- a/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/SpreadsheetInformationExtractorTest.scala @@ -46,21 +46,22 @@ class SpreadsheetInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20190815004345") + assert(dfResults(0).get(1) == "20190814234730") assert( dfResults(0).get( - 1 + 2 ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixture.ods" ) - assert(dfResults(0).get(2) == "test-aut-fixture.ods") - assert(dfResults(0).get(3) == "ods") + assert(dfResults(0).get(3) == "test-aut-fixture.ods") + assert(dfResults(0).get(4) == "ods") assert( - dfResults(0).get(4) == "application/vnd.oasis.opendocument.spreadsheet" + dfResults(0).get(5) == "application/vnd.oasis.opendocument.spreadsheet" ) assert( - dfResults(0).get(5) == "application/vnd.oasis.opendocument.spreadsheet" + dfResults(0).get(6) == "application/vnd.oasis.opendocument.spreadsheet" ) - assert(dfResults(0).get(6) == "7f70280757d8beb2d1bfd6fb1b6ae6e9") - assert(dfResults(0).get(7) == "448c357e78317877a98a399448031a89f1dda6fb") + assert(dfResults(0).get(7) == "7f70280757d8beb2d1bfd6fb1b6ae6e9") + assert(dfResults(0).get(8) == "448c357e78317877a98a399448031a89f1dda6fb") } after { diff --git a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala index 47ead7a6..ed8c2356 100644 --- a/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/VideoInformationExtractorTest.scala @@ -46,15 +46,16 @@ class VideoInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20190817230310") + assert(dfResults(0).get(1) == "20190812230929") assert( - dfResults(0).get(1) == "https://ruebot.net/2018-11-12%2016.14.11.mp4" + dfResults(0).get(2) == "https://ruebot.net/2018-11-12%2016.14.11.mp4" ) - assert(dfResults(0).get(2) == "2018-11-12%2016.14.11.mp4") - assert(dfResults(0).get(3) == "mp4") - assert(dfResults(0).get(4) == "video/mp4") + assert(dfResults(0).get(3) == "2018-11-12%2016.14.11.mp4") + assert(dfResults(0).get(4) == "mp4") assert(dfResults(0).get(5) == "video/mp4") - assert(dfResults(0).get(6) == "2cde7de3213a87269957033f6315fce2") - assert(dfResults(0).get(7) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2") + assert(dfResults(0).get(6) == "video/mp4") + assert(dfResults(0).get(7) == "2cde7de3213a87269957033f6315fce2") + assert(dfResults(0).get(8) == "f28c72fa4c0464a1a2b81fdc539b28cf574ac4c2") } after { diff --git a/src/test/scala/io/archivesunleashed/app/WebPagesExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/WebPagesExtractorTest.scala index 3654e6e4..a2be144c 100644 --- a/src/test/scala/io/archivesunleashed/app/WebPagesExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WebPagesExtractorTest.scala @@ -45,11 +45,12 @@ class 
WebPagesExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20080430204826") - assert(dfResults(0).get(1) == "archive.org") - assert(dfResults(0).get(2) == "http://www.archive.org/") - assert(dfResults(0).get(3) == "text/html") + assert(dfResults(0).get(1) == "20080109231829") + assert(dfResults(0).get(2) == "archive.org") + assert(dfResults(0).get(3) == "http://www.archive.org/") assert(dfResults(0).get(4) == "text/html") - assert(dfResults(0).get(5) == "en") + assert(dfResults(0).get(5) == "text/html") + assert(dfResults(0).get(6) == "en") } after { diff --git a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala index d3043f64..c6931090 100644 --- a/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/WordProcessorInformationExtractorTest.scala @@ -48,17 +48,18 @@ class WordProcessorInformationExtractorTest assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20190815004423") + assert(dfResults(0).get(1) == "20190814234647") assert( dfResults(0).get( - 1 + 2 ) == "https://ruebot.net/files/aut-test-fixtures/test-aut-fixtures.rtf" ) - assert(dfResults(0).get(2) == "test-aut-fixtures.rtf") - assert(dfResults(0).get(3) == "rtf") - assert(dfResults(0).get(4) == "application/rtf") + assert(dfResults(0).get(3) == "test-aut-fixtures.rtf") + assert(dfResults(0).get(4) == "rtf") assert(dfResults(0).get(5) == "application/rtf") - assert(dfResults(0).get(6) == "e483512b65ba44d71e843c57de2adeb7") - assert(dfResults(0).get(7) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb") + assert(dfResults(0).get(6) == "application/rtf") + assert(dfResults(0).get(7) == "e483512b65ba44d71e843c57de2adeb7") + assert(dfResults(0).get(8) == "8cf3066421f0a07fcd6e7a3e86ebd447edf7cfcb") } after { diff --git a/src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala b/src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala index 3662ccf6..a4ae2bdf 100644 --- a/src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala +++ b/src/test/scala/io/archivesunleashed/app/XmlInfromationExtractor.scala @@ -45,16 +45,17 @@ class XmlInformationExtractorTest extends FunSuite with BeforeAndAfter { assert(dfResults.length == RESULTSLENGTH) assert(dfResults(0).get(0) == "20080430204830") + assert(dfResults(0).get(1) == "") assert( dfResults(0) - .get(1) == "http://www.archive.org/services/collection-rss.php" + .get(2) == "http://www.archive.org/services/collection-rss.php" ) - assert(dfResults(0).get(2) == "collection-rss.php") - assert(dfResults(0).get(3) == "xml") - assert(dfResults(0).get(4) == "text/xml") - assert(dfResults(0).get(5) == "application/rss+xml") - assert(dfResults(0).get(6) == "647a665e6acc2141af6d377b02e16c99") - assert(dfResults(0).get(7) == "4dee969d37e188ce705c6b99b8a6ca62aa1418e5") + assert(dfResults(0).get(3) == "collection-rss.php") + assert(dfResults(0).get(4) == "xml") + assert(dfResults(0).get(5) == "text/xml") + assert(dfResults(0).get(6) == "application/rss+xml") + assert(dfResults(0).get(7) == "647a665e6acc2141af6d377b02e16c99") + assert(dfResults(0).get(8) == "4dee969d37e188ce705c6b99b8a6ca62aa1418e5") } after { From 24bb5e55d19a357fdff3df5c220e9598ad6c9345 Mon Sep 17 00:00:00 2001 From: Nick Ruest Date: Tue, 8 Nov 2022 15:10:24 -0500 Subject: [PATCH 13/20] Add scalafix and remove unused 
imports. (#548) --- pom.xml | 13 +++++++++++++ .../archivesunleashed/SparklingArchiveRecord.scala | 6 +----- .../app/AudioInformationExtractor.scala | 1 - .../io/archivesunleashed/app/CommandLineApp.scala | 4 ++-- .../app/CssInformationExtractor.scala | 2 -- .../app/DomainFrequencyExtractor.scala | 3 --- .../app/DomainGraphExtractor.scala | 2 -- .../app/ExtractPopularImagesDF.scala | 4 +--- .../app/HtmlInformationExtractor.scala | 2 -- .../archivesunleashed/app/ImageGraphExtractor.scala | 2 -- .../app/ImageInformationExtractor.scala | 1 - .../app/JsInformationExtractor.scala | 2 -- .../app/JsonInformationExtractor.scala | 2 -- .../app/PDFInformationExtractor.scala | 1 - .../archivesunleashed/app/PlainTextExtractor.scala | 3 +-- .../app/PlainTextInformationExtractor.scala | 2 -- .../PresentationProgramInformationExtractor.scala | 1 - .../app/SpreadsheetInformationExtractor.scala | 1 - .../app/VideoInformationExtractor.scala | 1 - .../archivesunleashed/app/WebGraphExtractor.scala | 2 -- .../archivesunleashed/app/WebPagesExtractor.scala | 10 ---------- .../app/WordProcessorInformationExtractor.scala | 1 - .../app/XmlInformationExtractor.scala | 2 -- .../matchbox/ExtractBoilerpipeText.scala | 1 - .../archivesunleashed/matchbox/ExtractLinks.scala | 1 - .../matchbox/ExtractTextFromPDFs.scala | 4 ---- .../io/archivesunleashed/matchbox/RemoveHTML.scala | 1 - .../io/archivesunleashed/matchbox/package.scala | 1 - src/main/scala/io/archivesunleashed/package.scala | 9 +++------ .../scala/io/archivesunleashed/udfs/package.scala | 3 --- .../scala/io/archivesunleashed/RecordDFTest.scala | 2 +- .../archivesunleashed/df/DataFrameLoaderTest.scala | 1 - .../df/ExtractAudioDetailsTest.scala | 1 - .../io/archivesunleashed/df/ExtractDateDFTest.scala | 7 +------ .../df/ExtractHyperlinksTest.scala | 2 +- .../df/ExtractImageDetailsTest.scala | 1 - .../df/ExtractPDFDetailsTest.scala | 1 - .../df/ExtractPresentationProgramDetailsTest.scala | 1 - .../df/ExtractSpreadsheetDetailsTest.scala | 1 - .../df/ExtractVideoDetailsTest.scala | 1 - .../df/ExtractWordProcessorDetailsTest.scala | 1 - .../io/archivesunleashed/df/SimpleDfTest.scala | 2 -- .../matchbox/ExtractBoilerPipeTextTest.scala | 2 -- .../matchbox/ExtractLinksTest.scala | 2 -- .../matchbox/GetExtensionMIMETest.scala | 2 +- .../archivesunleashed/matchbox/RemoveHTMLTest.scala | 2 -- 46 files changed, 25 insertions(+), 92 deletions(-) diff --git a/pom.xml b/pom.xml index 2d616bcc..4d3b41ce 100644 --- a/pom.xml +++ b/pom.xml @@ -182,7 +182,15 @@ -feature -explaintypes -target:jvm-1.8 + -Ywarn-unused-import + + + org.scalameta + semanticdb-scalac_${scala.version} + 4.6.0 + + @@ -328,6 +336,11 @@ + + io.github.evis + scalafix-maven-plugin_${scala.binary.version} + 0.1.7_0.10.4 + diff --git a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala index 5eb656c3..d91016ed 100644 --- a/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala +++ b/src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala @@ -22,11 +22,7 @@ import io.archivesunleashed.matchbox.ExtractDomain import org.apache.tika.io.BoundedInputStream import org.archive.webservices.sparkling.http.HttpMessage import org.archive.webservices.sparkling.io.IOUtil -import org.archive.webservices.sparkling.util.{ - ManagedVal, - RegexUtil, - ValueSupplier -} +import org.archive.webservices.sparkling.util.{ManagedVal, ValueSupplier} import org.archive.webservices.sparkling.warc.{WarcHeaders, WarcRecord} import 
scala.util.Try diff --git a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala index 93418897..8d7b5af5 100644 --- a/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/AudioInformationExtractor.scala @@ -16,7 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object AudioInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala index c1fc05bb..8340d6b7 100644 --- a/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala +++ b/src/main/scala/io/archivesunleashed/app/CommandLineApp.scala @@ -18,9 +18,9 @@ package io.archivesunleashed.app import java.io.File import java.nio.file.{Files, Paths} -import io.archivesunleashed.{ArchiveRecord, RecordLoader} +import io.archivesunleashed.RecordLoader import org.apache.log4j.Logger -import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.{SparkConf, SparkContext} import org.rogach.scallop.exceptions.ScallopException import org.rogach.scallop.ScallopConf diff --git a/src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala index 6d592428..96c93ad7 100644 --- a/src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object CssInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala b/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala index 192f466e..d9571748 100644 --- a/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/DomainFrequencyExtractor.scala @@ -16,10 +16,7 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import io.archivesunleashed.udfs.{extractDomain, removePrefixWWW} -import org.apache.spark.sql.functions.desc import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object DomainFrequencyExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala b/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala index c1e2800e..5fccd0d4 100644 --- a/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/DomainGraphExtractor.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import io.archivesunleashed.udfs.{extractDomain, removePrefixWWW} import org.apache.spark.sql.functions.{desc, substring} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} diff --git a/src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala b/src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala index a3fe4847..d1c74285 100644 --- a/src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala +++ 
b/src/main/scala/io/archivesunleashed/app/ExtractPopularImagesDF.scala @@ -15,10 +15,8 @@ */ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import org.apache.spark.sql.functions.{col, desc, first} +import org.apache.spark.sql.functions.{desc, first} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} -import org.apache.spark.{RangePartitioner, SparkContext} /** Extract the most popular images from a DataFrame. */ object ExtractPopularImagesDF { diff --git a/src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala index 7c43daa8..7335b294 100644 --- a/src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object HtmlInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala index 0362940f..4279661d 100644 --- a/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/ImageGraphExtractor.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object ImageGraphExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala index 29149036..7ede789d 100644 --- a/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/ImageInformationExtractor.scala @@ -16,7 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object ImageInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala index 0e976239..c051dbc2 100644 --- a/src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object JsInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala index 998a55be..7137bf91 100644 --- a/src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object JsonInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala index 168b7fa3..31224281 100644 --- 
a/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PDFInformationExtractor.scala @@ -16,7 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object PDFInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala b/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala index 3fe6a7dc..2061d559 100644 --- a/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PlainTextExtractor.scala @@ -16,8 +16,7 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.udfs.{extractBoilerpipeText} +import io.archivesunleashed.udfs.extractBoilerpipeText import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.functions.lower import scala.language.postfixOps diff --git a/src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala index f4495204..a4d6d8b2 100644 --- a/src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object PlainTextInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala index 60a4132f..29596abd 100644 --- a/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/PresentationProgramInformationExtractor.scala @@ -16,7 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object PresentationProgramInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala index 69597d29..ac12d598 100644 --- a/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/SpreadsheetInformationExtractor.scala @@ -16,7 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object SpreadsheetInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala index 245d4618..8579aef8 100644 --- a/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/VideoInformationExtractor.scala @@ -16,7 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object VideoInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala b/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala index 86f5a561..1767cc6d 100644 --- 
a/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WebGraphExtractor.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object WebGraphExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala index 216c93ba..f614ca6f 100644 --- a/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala @@ -16,13 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.udfs.{ - extractDomain, - removeHTML, - removeHTTPHeader, - removePrefixWWW -} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object WebPagesExtractor { @@ -35,9 +28,6 @@ object WebPagesExtractor { */ def apply(d: DataFrame): Dataset[Row] = { val spark = SparkSession.builder().master("local").getOrCreate() - // scalastyle:off - import spark.implicits._ - // scalastyle:on d } } diff --git a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala index 914ca7e8..5208141e 100644 --- a/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/WordProcessorInformationExtractor.scala @@ -16,7 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object WordProcessorInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala b/src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala index 79ad75a2..2335005d 100644 --- a/src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala +++ b/src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.app -import io.archivesunleashed.ArchiveRecord -import io.archivesunleashed.df.DataFrameLoader import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object XmlInformationExtractor { diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala index 3b82f6ce..37594912 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractBoilerpipeText.scala @@ -16,7 +16,6 @@ package io.archivesunleashed.matchbox import de.l3s.boilerpipe.extractors.DefaultExtractor -import java.io.IOException /** Extract raw text content from an HTML page, minus "boilerplate" content (using boilerpipe). 
*/ object ExtractBoilerpipeText { diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala index ec2e14b0..f64d960e 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractLinks.scala @@ -15,7 +15,6 @@ */ package io.archivesunleashed.matchbox -import java.io.IOException import org.jsoup.Jsoup import org.jsoup.select.Elements import scala.collection.mutable diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala index f5c2d949..49a69acf 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractTextFromPDFs.scala @@ -15,11 +15,7 @@ */ package io.archivesunleashed.matchbox -import java.io.ByteArrayInputStream -import org.apache.tika.metadata.Metadata -import org.apache.tika.parser.ParseContext import org.apache.tika.parser.pdf.PDFParser -import org.apache.tika.sax.BodyContentHandler; /** Exacts texts from PDFs using Apache Tika. */ object ExtractTextFromPDFs { diff --git a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala index 49f061d0..41f03839 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/RemoveHTML.scala @@ -15,7 +15,6 @@ */ package io.archivesunleashed.matchbox -import java.io.IOException import org.jsoup.Jsoup /** Removes HTML markup with JSoup. */ diff --git a/src/main/scala/io/archivesunleashed/matchbox/package.scala b/src/main/scala/io/archivesunleashed/matchbox/package.scala index fe954517..ce1a8cee 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/package.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/package.scala @@ -17,7 +17,6 @@ package io.archivesunleashed import java.io.IOException -import java.security.MessageDigest import scala.xml.Utility.escape /** Package object which supplies implicits providing common UDF-related functionalities. 
*/ diff --git a/src/main/scala/io/archivesunleashed/package.scala b/src/main/scala/io/archivesunleashed/package.scala index 6935594f..b176314d 100644 --- a/src/main/scala/io/archivesunleashed/package.scala +++ b/src/main/scala/io/archivesunleashed/package.scala @@ -35,14 +35,12 @@ import io.archivesunleashed.matchbox.{ } import io.archivesunleashed.matchbox.ExtractDate.DateComponent import io.archivesunleashed.matchbox.ExtractDate.DateComponent.DateComponent -import java.net.URI import java.net.URL import org.apache.commons.codec.binary.Hex import org.apache.commons.io.FilenameUtils import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.functions.{lit, lower, udf} import org.apache.spark.sql.types.{ BinaryType, IntegerType, @@ -50,8 +48,8 @@ import org.apache.spark.sql.types.{ StructField, StructType } -import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} -import org.apache.spark.{RangePartitioner, SerializableWritable, SparkContext} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.SparkContext import org.archive.webservices.sparkling.io.{HdfsIO, IOUtil} import org.archive.webservices.sparkling.util.{ IteratorUtil, @@ -59,12 +57,11 @@ import org.archive.webservices.sparkling.util.{ RddUtil, ValueSupplier } -import org.archive.webservices.sparkling.warc.{WarcLoader, WarcRecord} +import org.archive.webservices.sparkling.warc.WarcLoader import scala.language.postfixOps import scala.reflect.ClassTag import scala.util.matching.Regex -import scala.util.Try /** * Package object which supplies implicits to augment generic RDDs with AUT-specific transformations. diff --git a/src/main/scala/io/archivesunleashed/udfs/package.scala b/src/main/scala/io/archivesunleashed/udfs/package.scala index b3761b85..94bc8bc0 100644 --- a/src/main/scala/io/archivesunleashed/udfs/package.scala +++ b/src/main/scala/io/archivesunleashed/udfs/package.scala @@ -31,11 +31,8 @@ import io.archivesunleashed.matchbox.{ RemoveHTML, RemoveHTTPHeader } -import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.udf -import org.apache.spark.sql.SparkSession -import scala.util.matching.Regex /** Package object providing UDFs for DataFrames in Scala and PySpark. 
* */ package object udfs extends Serializable { diff --git a/src/test/scala/io/archivesunleashed/RecordDFTest.scala b/src/test/scala/io/archivesunleashed/RecordDFTest.scala index b8961bfc..4cef3046 100644 --- a/src/test/scala/io/archivesunleashed/RecordDFTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordDFTest.scala @@ -34,7 +34,7 @@ import io.archivesunleashed.udfs.{ } import com.google.common.io.Resources import org.apache.spark.sql.functions.lit -import org.apache.spark.sql.{Dataset, Row, SparkSession} +import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala index fb8f3de7..0354368c 100644 --- a/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala +++ b/src/test/scala/io/archivesunleashed/df/DataFrameLoaderTest.scala @@ -16,7 +16,6 @@ package io.archivesunleashed.df import com.google.common.io.Resources -import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala index faef838a..b8199d28 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractAudioDetailsTest.scala @@ -18,7 +18,6 @@ package io.archivesunleashed import com.google.common.io.Resources import org.apache.spark.sql.functions.desc -import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala index 9c72e965..3dba27eb 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractDateDFTest.scala @@ -17,12 +17,7 @@ package io.archivesunleashed import com.google.common.io.Resources -import io.archivesunleashed.udfs.{ - extractDate, - extractDomain, - extractLinks, - removePrefixWWW -} +import io.archivesunleashed.udfs.{extractDate, extractLinks} import org.apache.spark.sql.functions.{array, explode_outer, lower, udf} import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} diff --git a/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala index 540ada72..b2447a77 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractHyperlinksTest.scala @@ -17,7 +17,7 @@ package io.archivesunleashed import com.google.common.io.Resources -import io.archivesunleashed.udfs.{extractDomain, extractLinks, removePrefixWWW} +import io.archivesunleashed.udfs.extractLinks import org.apache.spark.sql.functions.{array, explode_outer, lower, udf} import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} diff --git a/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala index 523ea78f..9138942e 100644 --- 
a/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractImageDetailsTest.scala @@ -18,7 +18,6 @@ package io.archivesunleashed import com.google.common.io.Resources import org.apache.spark.sql.functions.desc -import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala index 5ffdf6d6..0c7797c7 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractPDFDetailsTest.scala @@ -18,7 +18,6 @@ package io.archivesunleashed import com.google.common.io.Resources import org.apache.spark.sql.functions.desc -import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala index c09d432f..265ac913 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractPresentationProgramDetailsTest.scala @@ -18,7 +18,6 @@ package io.archivesunleashed import com.google.common.io.Resources import org.apache.spark.sql.functions.desc -import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala index a36c3742..27bfd842 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractSpreadsheetDetailsTest.scala @@ -18,7 +18,6 @@ package io.archivesunleashed import com.google.common.io.Resources import org.apache.spark.sql.functions.desc -import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala index b0bf7abd..612dd790 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractVideoDetailsTest.scala @@ -18,7 +18,6 @@ package io.archivesunleashed import com.google.common.io.Resources import org.apache.spark.sql.functions.desc -import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala b/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala index bc3e9100..d20d4955 100644 --- a/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala +++ b/src/test/scala/io/archivesunleashed/df/ExtractWordProcessorDetailsTest.scala @@ -18,7 +18,6 @@ package io.archivesunleashed import com.google.common.io.Resources import org.apache.spark.sql.functions.desc -import org.apache.spark.sql.SparkSession 
import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala index 7088bf09..c12fe3f0 100644 --- a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala +++ b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala @@ -17,8 +17,6 @@ package io.archivesunleashed import com.google.common.io.Resources -import io.archivesunleashed.udfs.extractDomain -import org.apache.spark.sql.functions.desc import org.apache.spark.sql.SparkSession import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala index c036df3a..b57069a0 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractBoilerPipeTextTest.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.matchbox -import java.io.IOException - import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala index e0d78e8f..7da5031a 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractLinksTest.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.matchbox -import java.io.IOException - import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner diff --git a/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala b/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala index 66f64454..ef00b902 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/GetExtensionMIMETest.scala @@ -17,7 +17,7 @@ package io.archivesunleashed.matchbox import com.google.common.io.Resources -import io.archivesunleashed.{ArchiveRecord, RecordLoader} +import io.archivesunleashed.RecordLoader import org.apache.spark.sql.functions.desc import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.{SparkConf, SparkContext} diff --git a/src/test/scala/io/archivesunleashed/matchbox/RemoveHTMLTest.scala b/src/test/scala/io/archivesunleashed/matchbox/RemoveHTMLTest.scala index baa034c1..bbda8788 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/RemoveHTMLTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/RemoveHTMLTest.scala @@ -16,8 +16,6 @@ package io.archivesunleashed.matchbox -import java.io.IOException - import org.junit.runner.RunWith import org.scalatest.FunSuite import org.scalatest.junit.JUnitRunner From 569b3fe02870f05f64f2e5f55ceddb3e276b3dc4 Mon Sep 17 00:00:00 2001 From: nruest Date: Wed, 16 Nov 2022 20:00:59 -0500 Subject: [PATCH 14/20] [maven-release-plugin] prepare release aut-1.2.0 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 4d3b41ce..9f9e5910 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ io.archivesunleashed aut jar - 1.1.2-SNAPSHOT + 1.2.0 Archives Unleashed Toolkit An open-source toolkit for analyzing web archives. 
https://github.com/archivesunleashed/aut @@ -59,7 +59,7 @@ scm:git:git@github.com:archivesunleashed/aut.git scm:git:git@github.com:archivesunleashed/aut.git git@github.com:archivesunleashed/aut.git - HEAD + aut-1.2.0 From 6107542c14709b880b67ea2705abb071cc1ef675 Mon Sep 17 00:00:00 2001 From: nruest Date: Wed, 16 Nov 2022 20:01:00 -0500 Subject: [PATCH 15/20] [maven-release-plugin] prepare for next development iteration --- README.md | 4 ++-- pom.xml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 1e2df257..6d5ce71a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # The Archives Unleashed Toolkit [![codecov](https://codecov.io/gh/archivesunleashed/aut/branch/main/graph/badge.svg)](https://codecov.io/gh/archivesunleashed/aut) [![Maven Central](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut/badge.svg)](https://maven-badges.herokuapp.com/maven-central/io.archivesunleashed/aut) -[![Scaladoc](https://img.shields.io/badge/Scaladoc-1.1.1-blue?style=flat)](https://api.docs.archivesunleashed.io/1.1.1/scaladocs/io/archivesunleashed/index.html) -[![UserDocs](https://img.shields.io/badge/UserDocs-1.1.1-blue?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) +[![Scaladoc](https://img.shields.io/badge/Scaladoc-1.2.0-blue?style=flat)](https://api.docs.archivesunleashed.io/1.2.0/scaladocs/io/archivesunleashed/index.html) +[![UserDocs](https://img.shields.io/badge/UserDocs-1.2.0-blue?style=flat)](https://aut.docs.archivesunleashed.org/docs/home) [![LICENSE](https://img.shields.io/badge/license-Apache-blue.svg?style=flat)](https://www.apache.org/licenses/LICENSE-2.0) [![Contribution Guidelines](http://img.shields.io/badge/CONTRIBUTING-Guidelines-blue.svg)](./CONTRIBUTING.md) diff --git a/pom.xml b/pom.xml index 9f9e5910..6dacbabc 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ io.archivesunleashed aut jar - 1.2.0 + 1.2.1-SNAPSHOT Archives Unleashed Toolkit An open-source toolkit for analyzing web archives. https://github.com/archivesunleashed/aut @@ -59,7 +59,7 @@ scm:git:git@github.com:archivesunleashed/aut.git scm:git:git@github.com:archivesunleashed/aut.git git@github.com:archivesunleashed/aut.git - aut-1.2.0 + HEAD From a87216bc440d7cd55052c59938d8cc7333f4e87e Mon Sep 17 00:00:00 2001 From: nruest Date: Wed, 16 Nov 2022 20:52:03 -0500 Subject: [PATCH 16/20] Update CHANGELOG for 1.2.0 release. --- CHANGELOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bbcf3a08..b25822f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # Changelog +## [aut-1.2.0](https://github.com/archivesunleashed/aut/tree/aut-1.2.0) (2022-11-17) + +[Full Changelog](https://github.com/archivesunleashed/aut/compare/aut-1.1.1...aut-1.2.0) + +**Closed issues:** + +- Include last modified date for a resource [\#546](https://github.com/archivesunleashed/aut/issues/546) + +**Merged pull requests:** + +- Add scalafix and remove unused imports. 
[\#548](https://github.com/archivesunleashed/aut/pull/548) ([ruebot](https://github.com/ruebot)) +- Last modified headers [\#547](https://github.com/archivesunleashed/aut/pull/547) ([ruebot](https://github.com/ruebot)) + ## [aut-1.1.1](https://github.com/archivesunleashed/aut/tree/aut-1.1.1) (2022-10-31) [Full Changelog](https://github.com/archivesunleashed/aut/compare/aut-1.1.0...aut-1.1.1) From afb6b5a0f94f70357e9072cdddc4e7dab8136c51 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 9 Jul 2023 17:35:48 -0400 Subject: [PATCH 17/20] Bump guava from 29.0-jre to 32.0.0-jre (#550) Bumps [guava](https://github.com/google/guava) from 29.0-jre to 32.0.0-jre. - [Release notes](https://github.com/google/guava/releases) - [Commits](https://github.com/google/guava/commits) --- updated-dependencies: - dependency-name: com.google.guava:guava dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6dacbabc..ba45bf14 100644 --- a/pom.xml +++ b/pom.xml @@ -24,7 +24,7 @@ 2.12 2.7.4 3.0.1 - 29.0-jre + 32.0.0-jre github 3.0 2.5.2 From 549de14a42310804cb76475a89f72caef4d263be Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 9 Jul 2023 18:24:05 -0400 Subject: [PATCH 18/20] Bump snappy-java from 1.1.7.3 to 1.1.10.1 (#551) Bumps [snappy-java](https://github.com/xerial/snappy-java) from 1.1.7.3 to 1.1.10.1. - [Release notes](https://github.com/xerial/snappy-java/releases) - [Commits](https://github.com/xerial/snappy-java/compare/1.1.7.3...v1.1.10.1) --- updated-dependencies: - dependency-name: org.xerial.snappy:snappy-java dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index ba45bf14..e6c7a431 100644 --- a/pom.xml +++ b/pom.xml @@ -480,7 +480,7 @@ org.xerial.snappy snappy-java - 1.1.7.3 + 1.1.10.1 org.jsoup From 7646f4abf8b2a7b3b168dfe2dde67bc65bbe0282 Mon Sep 17 00:00:00 2001 From: Nick Ruest Date: Wed, 21 Feb 2024 16:37:33 -0500 Subject: [PATCH 19/20] Update Apache Commons Compress dependency. (#555) - dependabot alert --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index e6c7a431..345c47c8 100644 --- a/pom.xml +++ b/pom.xml @@ -438,7 +438,7 @@ org.apache.commons commons-compress - 1.21 + [1.26.0,) com.google.guava From 57c9b5ed167f97c9ea5e08ebd1a4a958c6da1818 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 27 Feb 2024 14:50:43 -0500 Subject: [PATCH 20/20] Bump org.xerial.snappy:snappy-java from 1.1.10.1 to 1.1.10.4 (#554) Bumps [org.xerial.snappy:snappy-java](https://github.com/xerial/snappy-java) from 1.1.10.1 to 1.1.10.4. - [Release notes](https://github.com/xerial/snappy-java/releases) - [Commits](https://github.com/xerial/snappy-java/compare/v1.1.10.1...v1.1.10.4) --- updated-dependencies: - dependency-name: org.xerial.snappy:snappy-java dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Nick Ruest --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 345c47c8..6aafe24d 100644 --- a/pom.xml +++ b/pom.xml @@ -480,7 +480,7 @@ org.xerial.snappy snappy-java - 1.1.10.1 + 1.1.10.4 org.jsoup