Skip to content

Commit

Permalink
plugin for gate 8.5.1
Browse files Browse the repository at this point in the history
  • Loading branch information
aaron-sr committed Jun 21, 2018
1 parent b37fcdc commit 43d7442
Show file tree
Hide file tree
Showing 8 changed files with 293 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
/bin
/classes
/.externalToolBuilders
*.jar
/target/
.classpath
.project
/.settings/
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# gateplugin-LanguageDetection
Integrate optimaize/language-detector into GATE
62 changes: 62 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?xml version="1.0"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>uk.ac.gate</groupId>
<artifactId>gate-plugin-base</artifactId>
<!-- The version of GATE you wish to build against -->
<!-- note that this is also the earliest version of GATE which will load
your plugin -->
<version>8.5.1</version>
<!-- This forces the parent to always be resolved from the repo -->
<relativePath />
</parent>

<!-- The Maven coordinates of this plugin -->
<groupId>gate.language-detection</groupId>
<artifactId>language-detection</artifactId>
<version>1.0-SNAPSHOT</version>

<!-- Information about this plugin. This will be used when the plugin is
shown in the GATE plugin manager -->

<!-- Add a descriptive short name of the plugin -->
<name></name>

<!-- Add a description of what the plugin does or can be used for. Do not
include the list of processing or language resources, this will be determined
automatically. About one or two paragraphs. -->
<description></description>

<!-- Add the URL of the plugin's user-oriented web page -->
<url></url>

<licenses>
<license>
<!-- The license used for most GATE plugins and components. Replace with
any other license which is compatible with the licenses of all the software
and libraries this plugin depends on. -->
<name>The Apache Software License, Version 2.0</name>
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url>
<distribution>repo</distribution>
</license>
</licenses>

<!-- there are plenty of other pieces of useful information you can also
provide here. See the pom.xml reference for full details -->


<dependencies>
<!-- Add the Maven coordinates of all libraries this plugin depends on.
The site https://search.maven.org/ can be used to search for libraries and
copy-paste the XML. Any GATE plugins needed at compile time should use "<scope>provided</scope>" -->
<dependency>
<groupId>com.optimaize.languagedetector</groupId>
<artifactId>language-detector</artifactId>
<version>0.6</version>
</dependency>
</dependencies>
</project>
174 changes: 174 additions & 0 deletions src/main/java/gate/languagedetection/LanguageDetection.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
package gate.languagedetection;

import java.io.IOException;
import java.util.List;

import com.optimaize.langdetect.DetectedLanguage;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;

import gate.Annotation;
import gate.AnnotationSet;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;

/**
 * GATE processing resource that runs optimaize/language-detector over a
 * document, or over selected annotations of a document, and records the
 * detected languages as a feature.
 *
 * <p>
 * The feature value is a string of {@code language:probability} entries
 * separated by {@code ", "}. When neither an input annotation set nor an input
 * annotation type is configured, the whole document text is analysed and the
 * result is stored in the document features; otherwise each selected
 * annotation is analysed on its own and the result is stored in that
 * annotation's features.
 *
 * <p>
 * Entries are appended to whatever value the target feature map already holds
 * under the configured feature name, so running the resource repeatedly over
 * the same document accumulates entries.
 */
@CreoleResource(name = "LanguageDetection", comment = "Integrate optimaize/language-detector (https://github.com/optimaize/language-detector) as a Processing Resource")
public class LanguageDetection extends AbstractLanguageAnalyser {

	private static final long serialVersionUID = 4531104124991700665L;

	/** Separator between two {@code language:probability} entries. */
	private static final String DETECTEDLANGUAGE_SPLIT = ", ";
	/** Separator between the language code and its probability. */
	private static final String PROBABILITY_SPLIT = ":";

	/** Init-time parameter: profiles to load; null or empty loads all built-in profiles. */
	private List<String> languageFilter;

	/** Runtime parameter: name of the feature that receives the result. */
	private String featureName;
	/** Runtime parameter: name of the annotation set to read input annotations from. */
	private String inputASName;
	/** Runtime parameter: annotation type to analyse. */
	private String inputAnnotation;

	/** Runtime parameter: minimum probability for a language to be recorded; null disables filtering. */
	private Double threshold;

	/** Detector built in {@link #init()} from the selected language profiles. */
	private LanguageDetector detector;

	/**
	 * Loads the language profiles and builds the detector.
	 *
	 * <p>
	 * All built-in profiles are read when no language filter is set; otherwise
	 * only the profiles named in the filter are read.
	 *
	 * @return this resource
	 * @throws ResourceInstantiationException
	 *             if the profiles cannot be read or the detector cannot be
	 *             built
	 */
	@Override
	public Resource init() throws ResourceInstantiationException {
		try {
			LanguageProfileReader profileReader = new LanguageProfileReader();
			List<LanguageProfile> languageProfiles;
			if (languageFilter == null || languageFilter.isEmpty()) {
				languageProfiles = profileReader.readAllBuiltIn();
			} else {
				languageProfiles = profileReader.read(languageFilter);
			}
			detector = LanguageDetectorBuilder.create(NgramExtractors.standard()).withProfiles(languageProfiles)
					.build();
		} catch (IllegalStateException | IOException e) {
			throw new ResourceInstantiationException(e);
		}
		return this;
	}

	/**
	 * Rebuilds the detector by running {@link #init()} again, so that a changed
	 * language filter takes effect.
	 *
	 * @throws ResourceInstantiationException
	 *             if {@link #init()} fails
	 */
	@Override
	public void reInit() throws ResourceInstantiationException {
		init();
	}

	/**
	 * Detects languages for the current document.
	 *
	 * <p>
	 * With neither {@code inputASName} nor {@code inputAnnotation} set, the whole
	 * document content is analysed and the result goes to the document features.
	 * Otherwise the annotations of the named set (restricted to the given type
	 * when one is set) are analysed one by one and each result goes to the
	 * features of the annotation it was computed for.
	 *
	 * @throws ExecutionException
	 *             if no document is set, the detector has not been initialised,
	 *             or any error occurs during detection
	 */
	@Override
	public void execute() throws ExecutionException {
		// Fail with a clear message instead of a wrapped NullPointerException.
		if (document == null) {
			throw new ExecutionException("No document to process");
		}
		if (detector == null) {
			throw new ExecutionException("Language detector is not initialised");
		}
		try {
			if (isEmpty(inputASName) && isEmpty(inputAnnotation)) {
				String text = document.getContent().toString();
				detectLanguage(text, document.getFeatures());
			} else {
				AnnotationSet inputAnnotationSet = document.getAnnotations(inputASName);
				if (!isEmpty(inputAnnotation)) {
					inputAnnotationSet = inputAnnotationSet.get(inputAnnotation);
				}
				for (Annotation annotation : inputAnnotationSet) {
					String text = document.getContent()
							.getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset())
							.toString();
					detectLanguage(text, annotation.getFeatures());
				}
			}
		} catch (Exception e) {
			// Processing-resource boundary: report every failure as an ExecutionException.
			throw new ExecutionException(e);
		}
	}

	/**
	 * Runs the detector on {@code text} and records every language that passes
	 * the threshold in {@code featureMap}.
	 *
	 * @param text
	 *            the text to analyse
	 * @param featureMap
	 *            the feature map that receives the result
	 */
	private void detectLanguage(String text, FeatureMap featureMap) {
		List<DetectedLanguage> probabilities = detector.getProbabilities(text);
		for (DetectedLanguage detectedLanguage : probabilities) {
			double probability = detectedLanguage.getProbability();
			if (threshold == null || probability >= threshold) {
				appendLanguageToFeatureMap(featureMap, detectedLanguage.getLocale().getLanguage(), probability);
			}
		}
	}

	/**
	 * Appends one {@code language:probability} entry to the value stored under
	 * the configured feature name in {@code featureMap}.
	 *
	 * <p>
	 * The existing value is read from {@code featureMap} itself, i.e. from the
	 * same map that is written. Reading it from the document features instead
	 * would, for annotations, drop all but the last detected language and leak
	 * the document's value into every annotation.
	 *
	 * @param featureMap
	 *            the feature map to update
	 * @param language
	 *            the detected language code
	 * @param probability
	 *            the probability reported for that language
	 */
	private void appendLanguageToFeatureMap(FeatureMap featureMap, String language, double probability) {
		String entry = asString(language, probability);
		Object existing = featureMap.get(featureName);
		if (existing != null) {
			featureMap.put(featureName, existing.toString() + DETECTEDLANGUAGE_SPLIT + entry);
		} else {
			featureMap.put(featureName, entry);
		}
	}

	/**
	 * Formats a single entry as {@code language:probability}.
	 */
	private String asString(String language, double probability) {
		return language + PROBABILITY_SPLIT + probability;
	}

	/**
	 * Returns whether {@code string} is null or has length zero.
	 */
	private boolean isEmpty(String string) {
		return string == null || string.length() == 0;
	}

	/**
	 * @return the language filter, or null when none is set
	 */
	public List<String> getLanguageFilter() {
		return languageFilter;
	}

	/**
	 * Sets the language profiles to load in {@link #init()}. A null or empty
	 * list loads all built-in profiles.
	 *
	 * @param languageFilter
	 *            the names of the profiles to load
	 */
	@Optional
	@CreoleParameter(comment = "Only detect following languages")
	public void setLanguageFilter(List<String> languageFilter) {
		this.languageFilter = languageFilter;
	}

	/**
	 * @return the name of the feature that receives the detection result
	 */
	public String getFeatureName() {
		return featureName;
	}

	/**
	 * Sets the name of the document or annotation feature that receives the
	 * detection result.
	 *
	 * @param featureName
	 *            the feature name
	 */
	@RunTime
	@CreoleParameter(comment = "Name of the feature to store detected language, document or annotation", defaultValue = "lang")
	public void setFeatureName(String featureName) {
		this.featureName = featureName;
	}

	/**
	 * @return the name of the input annotation set, or null when none is set
	 */
	public String getInputASName() {
		return inputASName;
	}

	/**
	 * Sets the name of the annotation set whose annotations are analysed instead
	 * of the whole document.
	 *
	 * @param inputASName
	 *            the annotation set name
	 */
	@Optional
	@RunTime
	@CreoleParameter(comment = "analyse specific annotation instead of whole document")
	public void setInputASName(String inputASName) {
		this.inputASName = inputASName;
	}

	/**
	 * @return the annotation type to analyse, or null when none is set
	 */
	public String getInputAnnotation() {
		return inputAnnotation;
	}

	/**
	 * Sets the annotation type whose annotations are analysed instead of the
	 * whole document.
	 *
	 * @param inputAnnotation
	 *            the annotation type
	 */
	@Optional
	@RunTime
	@CreoleParameter(comment = "analyse specific annotation instead of whole document")
	public void setInputAnnotation(String inputAnnotation) {
		this.inputAnnotation = inputAnnotation;
	}

	/**
	 * Sets the minimum probability a detected language must reach to be
	 * recorded. Null records every language the detector reports.
	 *
	 * @param threshold
	 *            the minimum probability, or null for no filtering
	 */
	@Optional
	@RunTime
	@CreoleParameter(comment = "Only annotate languages with threshold")
	public void setThreshold(Double threshold) {
		this.threshold = threshold;
	}

	/**
	 * @return the minimum probability, or null when no filtering is applied
	 */
	public Double getThreshold() {
		return threshold;
	}

}
8 changes: 8 additions & 0 deletions src/main/resources/creole.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?xml version="1.0"?>
<CREOLE-DIRECTORY>

<!-- plugins can require other plugins.
For example to require ANNIE you would include the following:
<REQUIRES GROUP="uk.ac.gate.plugins" ARTIFACT="annie" VERSION="8.5"/>
-->
</CREOLE-DIRECTORY>
14 changes: 14 additions & 0 deletions src/test/java/gate/languagedetection/GappLoadingTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package gate.languagedetection;

import gate.test.GappLoadingTestCase;

/**
* Test case that checks the pipelines shipped with this plugin for proper
* loading.
*
* This class adds no tests of its own; everything is inherited from
* GappLoadingTestCase. According to the plugin template, the inherited test
* tries to load all pipelines (any file with an extension ".gapp" or
* ".xgapp") which reside in the main/resources/resources directory tree.
*/
public class GappLoadingTest extends GappLoadingTestCase {

}
22 changes: 22 additions & 0 deletions src/test/java/gate/languagedetection/TestingClass.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package gate.languagedetection;

import gate.test.GATEPluginTests;
import org.junit.Test;
import static org.junit.Assert.*;


/**
* Container for the plugin's unit tests.
*
* According to the plugin template, extending GATEPluginTests initializes GATE
* and loads the plugin, so that any method in this class with the "@Test"
* annotation runs with the plugin already properly loaded.
*
*/
public class TestingClass extends GATEPluginTests {

/**
* Placeholder test: it has an empty body and asserts nothing.
*/
@Test
public void testSomething() {
// testing code goes here
}
}
3 changes: 3 additions & 0 deletions src/test/resources/creole.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
groupId=${project.groupId}
artifactId=${project.artifactId}
version=${project.version}

0 comments on commit 43d7442

Please sign in to comment.