Commit
Add ARCH text files derivatives. (#541)
- Add css, html, js, json, plain text, and xml information extraction methods (usage sketch below)
- Add app extractors
- Add Python implementation of extractors
- Add tests
- Resolves #540
- Fix handling of MIME types that carry extra data after the type (for example, charset parameters)
- Resolves #542
- Update Tika to 1.23
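
A minimal spark-shell sketch of the new loaders and their app extractors, assuming the aut fatjar is on the classpath; the WARC and output paths are placeholders, and sc is the SparkContext that spark-shell provides:

  import io.archivesunleashed._
  import io.archivesunleashed.app._

  // Placeholder input; point this at real WARC/ARC files.
  val warcs = "/path/to/warcs/*.warc.gz"

  // New text-derivative loaders added by this commit.
  val css = RecordLoader.loadArchives(warcs, sc).css()
  val js = RecordLoader.loadArchives(warcs, sc).js()
  val plainText = RecordLoader.loadArchives(warcs, sc).plainText()

  // Each loader pairs with an app extractor; write one derivative out as CSV.
  CssInformationExtractor(css)
    .write
    .option("header", "true")
    .csv("/path/to/output/css")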
ruebot committed Jun 17, 2022
1 parent 2b8b717 commit 8172855
Showing 22 changed files with 1,418 additions and 16 deletions.
2 changes: 1 addition & 1 deletion pom.xml
@@ -42,7 +42,7 @@
<surefire.plugin.version>2.22.0</surefire.plugin.version>
<jacoco.plugin.version>0.8.4</jacoco.plugin.version>
<versions.plugin.version>2.1</versions.plugin.version>
- <tika.version>1.22</tika.version>
+ <tika.version>1.23</tika.version>
<jackson.version>2.10.0</jackson.version>
<scala.maven.plugin.version>4.5.4</scala.maven.plugin.version>
</properties>
18 changes: 18 additions & 0 deletions src/main/python/aut/common.py
@@ -14,15 +14,30 @@ def all(self):
def audio(self):
return DataFrame(self.loader.audio(self.path), self.sqlContext)

def css(self):
return DataFrame(self.loader.css(self.path), self.sqlContext)

def html(self):
return DataFrame(self.loader.html(self.path), self.sqlContext)

def imagegraph(self):
return DataFrame(self.loader.imagegraph(self.path), self.sqlContext)

def images(self):
return DataFrame(self.loader.images(self.path), self.sqlContext)

def js(self):
return DataFrame(self.loader.js(self.path), self.sqlContext)

def json(self):
return DataFrame(self.loader.json(self.path), self.sqlContext)

def pdfs(self):
return DataFrame(self.loader.pdfs(self.path), self.sqlContext)

def plain_text(self):
return DataFrame(self.loader.plainText(self.path), self.sqlContext)

def presentation_program(self):
return DataFrame(
self.loader.presentationProgramFiles(self.path), self.sqlContext
@@ -42,3 +57,6 @@ def webpages(self):

def word_processor(self):
return DataFrame(self.loader.wordProcessorFiles(self.path), self.sqlContext)

def xml(self):
return DataFrame(self.loader.xml(self.path), self.sqlContext)
src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala
@@ -81,7 +81,10 @@ class SparklingArchiveRecord(
}.getOrElse("")

override def getMimeType: String =
- http(warc).flatMap(_.mime).getOrElse("unknown")
+ http(warc)
+   .flatMap(_.mime)
+   .getOrElse("unknown")
+   .replaceAll(" .*|\\s|\\n", "")

override def getUrl: String = warc.url.getOrElse("").replaceAll("<|>", "")

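
The new replaceAll(" .*|\\s|\\n", "") in getMimeType drops everything after the first space, plus stray whitespace and newlines, so a Content-Type value that carries parameters collapses to the bare MIME string. A small sketch of the effect, using made-up input values:

  // Illustrative only: the same regex applied to a few example MIME strings.
  val clean = (mime: String) => mime.replaceAll(" .*|\\s|\\n", "")

  clean("text/html; charset=UTF-8") // => "text/html;"
  clean("application/pdf\n")        // => "application/pdf"
  clean("unknown")                  // => "unknown"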
102 changes: 102 additions & 0 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
@@ -313,6 +313,108 @@ class CommandLineApp(conf: CmdAppConf) {
} else {
saveCsv(WordProcessorInformationExtractor(df))
}
}),
"CssInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.css()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).css()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(CssInformationExtractor(df))
} else {
saveCsv(CssInformationExtractor(df))
}
}),
"HtmlInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.html()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).html()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(HtmlInformationExtractor(df))
} else {
saveCsv(HtmlInformationExtractor(df))
}
}),
"JsInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.js()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).js()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(JsInformationExtractor(df))
} else {
saveCsv(JsInformationExtractor(df))
}
}),
"JsonInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.json()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).json()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(JsonInformationExtractor(df))
} else {
saveCsv(JsonInformationExtractor(df))
}
}),
"PlainTextInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.plainText()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).plainText()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(PlainTextInformationExtractor(df))
} else {
saveCsv(PlainTextInformationExtractor(df))
}
}),
"XmlInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.xml()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).xml()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(XmlInformationExtractor(df))
} else {
saveCsv(XmlInformationExtractor(df))
}
})
)

35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object CssInformationExtractor {

/** Extract CSS file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of CSS file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object HtmlInformationExtractor {

/** Extract HTML file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of HTML file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object JsInformationExtractor {

/** Extract JavaScript file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of JavaScript file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object JsonInformationExtractor {

/** Extract JSON file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of JSON file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PlainTextInformationExtractor {

/** Extract plain text file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of plain text file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
10 changes: 1 addition & 9 deletions src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala
@@ -38,14 +38,6 @@ object WebPagesExtractor {
// scalastyle:off
import spark.implicits._
// scalastyle:on
- d.select(
-   $"crawl_date",
-   $"domain",
-   $"url",
-   $"mime_type_web_server",
-   $"mime_type_tika",
-   $"language",
-   $"content"
- )
+ d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object XmlInformationExtractor {

/** Extract XML file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of XML file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}