Commit
Add ARCH text files derivatives. (#541)
- Add css, html, js, json, plain text, and xml information extraction methods (usage sketch below)
- Add app extractors
- Add Python implementation of extractors
- Add tests
- Resolves #540
- Fix handling of MIME types that carry extra data after the type (for example, charset parameters)
- Resolves #542
- Update Tika to 1.23
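
A minimal spark-shell sketch of the new loaders and their app extractors, assuming the aut fatjar is on the classpath; the WARC and output paths are placeholders, and sc is the SparkContext that spark-shell provides:

  import io.archivesunleashed._
  import io.archivesunleashed.app._

  // Placeholder input; point this at real WARC/ARC files.
  val warcs = "/path/to/warcs/*.warc.gz"

  // New text-derivative loaders added by this commit.
  val css = RecordLoader.loadArchives(warcs, sc).css()
  val js = RecordLoader.loadArchives(warcs, sc).js()
  val plainText = RecordLoader.loadArchives(warcs, sc).plainText()

  // Each loader pairs with an app extractor; write one derivative out as CSV.
  CssInformationExtractor(css)
    .write
    .option("header", "true")
    .csv("/path/to/output/css")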
ruebot committed Jun 17, 2022
1 parent 2b8b717 commit 8172855
Showing 22 changed files with 1,418 additions and 16 deletions.
2 changes: 1 addition & 1 deletion pom.xml
@@ -42,7 +42,7 @@
<surefire.plugin.version>2.22.0</surefire.plugin.version>
<jacoco.plugin.version>0.8.4</jacoco.plugin.version>
<versions.plugin.version>2.1</versions.plugin.version>
- <tika.version>1.22</tika.version>
+ <tika.version>1.23</tika.version>
<jackson.version>2.10.0</jackson.version>
<scala.maven.plugin.version>4.5.4</scala.maven.plugin.version>
</properties>
18 changes: 18 additions & 0 deletions src/main/python/aut/common.py
@@ -14,15 +14,30 @@ def all(self):
def audio(self):
return DataFrame(self.loader.audio(self.path), self.sqlContext)

def css(self):
return DataFrame(self.loader.css(self.path), self.sqlContext)

def html(self):
return DataFrame(self.loader.html(self.path), self.sqlContext)

def imagegraph(self):
return DataFrame(self.loader.imagegraph(self.path), self.sqlContext)

def images(self):
return DataFrame(self.loader.images(self.path), self.sqlContext)

def js(self):
return DataFrame(self.loader.js(self.path), self.sqlContext)

def json(self):
return DataFrame(self.loader.json(self.path), self.sqlContext)

def pdfs(self):
return DataFrame(self.loader.pdfs(self.path), self.sqlContext)

def plain_text(self):
return DataFrame(self.loader.plainText(self.path), self.sqlContext)

def presentation_program(self):
return DataFrame(
self.loader.presentationProgramFiles(self.path), self.sqlContext
@@ -42,3 +57,6 @@ def webpages(self):

def word_processor(self):
return DataFrame(self.loader.wordProcessorFiles(self.path), self.sqlContext)

def xml(self):
return DataFrame(self.loader.xml(self.path), self.sqlContext)
src/main/scala/io/archivesunleashed/SparklingArchiveRecord.scala
@@ -81,7 +81,10 @@ class SparklingArchiveRecord(
}.getOrElse("")

override def getMimeType: String =
- http(warc).flatMap(_.mime).getOrElse("unknown")
+ http(warc)
+   .flatMap(_.mime)
+   .getOrElse("unknown")
+   .replaceAll(" .*|\\s|\\n", "")

override def getUrl: String = warc.url.getOrElse("").replaceAll("<|>", "")

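
The new replaceAll(" .*|\\s|\\n", "") in getMimeType drops everything after the first space, plus stray whitespace and newlines, so a Content-Type value that carries parameters collapses to the bare MIME string. A small sketch of the effect, using made-up input values:

  // Illustrative only: the same regex applied to a few example MIME strings.
  val clean = (mime: String) => mime.replaceAll(" .*|\\s|\\n", "")

  clean("text/html; charset=UTF-8") // => "text/html;"
  clean("application/pdf\n")        // => "application/pdf"
  clean("unknown")                  // => "unknown"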
102 changes: 102 additions & 0 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
@@ -313,6 +313,108 @@ class CommandLineApp(conf: CmdAppConf) {
} else {
saveCsv(WordProcessorInformationExtractor(df))
}
}),
"CssInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.css()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).css()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(CssInformationExtractor(df))
} else {
saveCsv(CssInformationExtractor(df))
}
}),
"HtmlInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.html()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).html()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(HtmlInformationExtractor(df))
} else {
saveCsv(HtmlInformationExtractor(df))
}
}),
"JsInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.js()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).js()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(JsInformationExtractor(df))
} else {
saveCsv(JsInformationExtractor(df))
}
}),
"JsonInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.json()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).json()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(JsonInformationExtractor(df))
} else {
saveCsv(JsonInformationExtractor(df))
}
}),
"PlainTextInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.plainText()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).plainText()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(PlainTextInformationExtractor(df))
} else {
saveCsv(PlainTextInformationExtractor(df))
}
}),
"XmlInformationExtractor" ->
((inputFiles: List[String]) => {
var df = RecordLoader
.loadArchives(inputFiles.head, sparkCtx.get)
.xml()
inputFiles.tail foreach { f =>
df = df.union(
RecordLoader.loadArchives(f, sparkCtx.get).xml()
)
}
if (!configuration.outputFormat.isEmpty && configuration
.outputFormat() == "parquet") {
saveParquet(XmlInformationExtractor(df))
} else {
saveCsv(XmlInformationExtractor(df))
}
})
)

35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/CssInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object CssInformationExtractor {

/** Extract CSS file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of CSS file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/HtmlInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object HtmlInformationExtractor {

/** Extract HTML file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of HTML file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/JsInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object JsInformationExtractor {

/** Extract JavaScript file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of JavaScript file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/JsonInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object JsonInformationExtractor {

/** Extract JSON file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of JSON file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/PlainTextInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object PlainTextInformationExtractor {

/** Extract plain text file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of plain text file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}
10 changes: 1 addition & 9 deletions src/main/scala/io/archivesunleashed/app/WebPagesExtractor.scala
@@ -38,14 +38,6 @@ object WebPagesExtractor {
// scalastyle:off
import spark.implicits._
// scalastyle:on
- d.select(
-   $"crawl_date",
-   $"domain",
-   $"url",
-   $"mime_type_web_server",
-   $"mime_type_tika",
-   $"language",
-   $"content"
- )
+ d
}
}
35 changes: 35 additions & 0 deletions src/main/scala/io/archivesunleashed/app/XmlInformationExtractor.scala
@@ -0,0 +1,35 @@
/*
* Copyright © 2017 The Archives Unleashed Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package io.archivesunleashed.app

import io.archivesunleashed.ArchiveRecord
import io.archivesunleashed.df.DataFrameLoader
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object XmlInformationExtractor {

/** Extract XML file information from a web archive using DataFrame and Spark SQL.
*
* @param d DataFrame obtained from RecordLoader
* @return Dataset[Row] of XML file information
*/
def apply(d: DataFrame): Dataset[Row] = {
val spark = SparkSession.builder().master("local").getOrCreate()
d
}
}