Skip to content

Commit

Permalink
Update to Spark 2.4.3 and update Tika to 1.20. (#321)
Browse files — browse the repository at this point in the history
* Update to Spark 2.4.3 and update Tika to 1.20.

- Resolves #295
- Resolves #308
- Resolves #286
- Pulls in unfinished work by @jrwiebe and @borislin.

* Add patched lang-detector
  • Loading branch information
ruebot authored and ianmilligan1 committed Jul 17, 2019
1 parent 20ffeeb commit 0e701b2
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 85 deletions.
233 changes: 155 additions & 78 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
<project_organization>The Archives Unleashed Project</project_organization>
<scala.version>2.11.8</scala.version>
<hadoop.version>2.6.5</hadoop.version>
<spark.version>2.3.1</spark.version>
<spark.version>2.4.3</spark.version>
<github.global.server>github</github.global.server>
<checkstyle.plugin.version>2.17</checkstyle.plugin.version>
<license.plugin.version>3.0</license.plugin.version>
Expand All @@ -42,6 +42,7 @@
<surefire.plugin.version>2.18.1</surefire.plugin.version>
<jacoco.plugin.version>0.7.5.201505241946</jacoco.plugin.version>
<versions.plugin.version>2.1</versions.plugin.version>
<tika.version>1.20</tika.version>
</properties>

<licenses>
Expand All @@ -64,6 +65,10 @@
<id>maven</id>
<!-- Maven Central only accepts HTTPS (plain HTTP has been rejected since January 2020). -->
<url>https://repo.maven.apache.org/maven2/</url>
</repository>
<repository>
<id>jitpack.io</id>
<url>https://jitpack.io</url>
</repository>
</repositories>

<build>
Expand Down Expand Up @@ -480,25 +485,98 @@
<artifactId>scala-parser-combinators_2.11</artifactId>
<version>1.0.5</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_2.11</artifactId>
<version>2.8.8</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
<exclusions>
<exclusion>
<groupId>commons-configuration</groupId>
<artifactId>commons-configuration</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
</exclusion>
<exclusion>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</exclusion>
<exclusion>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</exclusion>
<exclusion>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>${spark.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
</exclusion>
<exclusion>
<groupId>com.thoughtworks.paranamer</groupId>
<artifactId>paranamer</artifactId>
</exclusion>
<exclusion>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>jul-to-slf4j</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
</exclusion>
<exclusion>
<groupId>javax.ws.rs</groupId>
<artifactId>javax.ws.rs-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
Expand All @@ -510,6 +588,11 @@
<artifactId>spark-graphx_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>15.0</version>
</dependency>
<dependency>
<groupId>org.xerial.snappy</groupId>
<artifactId>snappy-java</artifactId>
Expand All @@ -529,22 +612,69 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-core</artifactId>
</exclusion>
<exclusion>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
</exclusion>
<exclusion>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
</exclusion>
<exclusion>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>edu.stanford.nlp</groupId>
<artifactId>stanford-corenlp</artifactId>
<version>3.8.0</version>
<exclusions>
<exclusion>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</exclusion>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.19.1</version>
<version>${tika.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.19.1</version>
<version>${tika.version}</version>
<exclusions>
<exclusion>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</exclusion>
<exclusion>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
</exclusion>
<exclusion>
<groupId>javax.ws.rs</groupId>
<artifactId>javax.ws.rs-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-langdetect</artifactId>
<version>${tika.version}</version>
<exclusions>
<exclusion>
<groupId>com.optimaize.languagedetector</groupId>
<artifactId>language-detector</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.rogach</groupId>
Expand All @@ -566,82 +696,19 @@
<artifactId>lintools-datatypes</artifactId>
<version>1.0.0</version>
</dependency>
<!--START issue #113-->
<!--START pull #321-->
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
<version>2.2</version>
<groupId>com.github.netarchivesuite</groupId>
<artifactId>language-detector</artifactId>
<version>language-detector-0.6a</version>
</dependency>
<!--END pull #321-->
<!--START issue #113-->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.18</version>
</dependency>
<dependency>
<groupId>net.java.dev.jets3t</groupId>
<artifactId>jets3t</artifactId>
<version>0.6.1</version>
</dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<version>1.5.2</version>
</dependency>
<dependency>
<groupId>commons-net</groupId>
<artifactId>commons-net</artifactId>
<version>1.4.1</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.1.3</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
<version>1.7.24</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jul-to-slf4j</artifactId>
<version>1.7.24</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.24</version>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
<version>3.0.1</version>
</dependency>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-xml_2.11</artifactId>
<version>1.0.6</version>
</dependency>
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.8.11</version>
</dependency>
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>20140107</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.3.1</version>
</dependency>
<dependency>
<groupId>com.thoughtworks.paranamer</groupId>
<artifactId>paranamer</artifactId>
<version>2.8</version>
</dependency>
<dependency>
<groupId>jline</groupId>
<artifactId>jline</artifactId>
Expand All @@ -650,7 +717,17 @@
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-math3</artifactId>
<version>3.2</version>
<version>3.6.1</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.10</version>
</dependency>
<!--END issue #113-->
<!-- for codecov.io -->
Expand Down
8 changes: 6 additions & 2 deletions src/main/scala/io/archivesunleashed/app/CommandLineApp.scala
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,13 @@ class CommandLineApp(conf: CmdAppConf) {

/** Writes the dataset to `saveTarget` as CSV.
  *
  * When `configuration.partition` is non-empty the dataset is first coalesced
  * using `configuration.partition()`; otherwise it is written as-is.
  *
  * NOTE(review): this listing comes from a diff view that shows removed and
  * added lines together without +/- markers. The bare `.write.csv(saveTarget)`
  * calls look like the pre-change lines and the `.option("timestampFormat", ...)`
  * chains like their replacements. Confirm against the repository before
  * relying on this block as-is.
  *
  * @param d dataset to write out
  */
def save(d: Dataset[Row]): Unit = {
if (!configuration.partition.isEmpty) {
// Coalesced write with no writer options (presumably the removed line; see note above).
d.coalesce(configuration.partition()).write.csv(saveTarget)
// Coalesced write that passes an explicit timestampFormat option to the CSV writer.
d.coalesce(configuration.partition()).write
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.csv(saveTarget)
} else {
// Un-coalesced write with no writer options (presumably the removed line; see note above).
d.write.csv(saveTarget)
// Un-coalesced write with the same explicit timestampFormat option as the branch above.
d.write
.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
.csv(saveTarget)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
*/
package io.archivesunleashed.matchbox

import org.apache.tika.language.LanguageIdentifier
import org.apache.tika.langdetect.OptimaizeLangDetector;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;

/** Detects language using Apache Tika. */
object DetectLanguage {
Expand All @@ -30,7 +32,9 @@ object DetectLanguage {
if (input.isEmpty) {
""
} else {
new LanguageIdentifier(input).getLanguage
val detector: LanguageDetector = new OptimaizeLangDetector().loadModels()
val result : LanguageResult = detector.detect(input)
result.getLanguage()
}
}
}
2 changes: 1 addition & 1 deletion src/test/scala/io/archivesunleashed/ArcTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ class ArcTest extends FunSuite with BeforeAndAfter {
.collect

languageCounts.foreach {
case ("en", count) => assert(57L == count)
case ("en", count) => assert(135L == count)
case ("et", count) => assert(6L == count)
case ("it", count) => assert(1L == count)
case ("lt", count) => assert(61L == count)
Expand Down
4 changes: 2 additions & 2 deletions src/test/scala/io/archivesunleashed/RecordRDDTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
val base2 = RecordLoader.loadArchives(arcPath, sc)
.keepValidPages()
val langs: Set[String] = Set("en", "fr")
val r = Array("http://www.archive.org/index.php",
"http://www.archive.org/details/DrinkingWithBob-MadonnaAdoptsAfricanBaby887")
val r = Array("http://www.archive.org/",
"http://www.archive.org/index.php")
val r2 = base2.keepLanguages(langs)
.map(r => r.getUrl).take(2)
assert (r2.sameElements(r))
Expand Down

0 comments on commit 0e701b2

Please sign in to comment.