-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
293 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
/bin | ||
/classes | ||
/.externalToolBuilders | ||
*.jar | ||
/target/ | ||
.classpath | ||
.project | ||
/.settings/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# gateplugin-LanguageDetection | ||
Integrate optimaize/language-detector into GATE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
<?xml version="1.0"?> | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<modelVersion>4.0.0</modelVersion> | ||
|
||
<parent> | ||
<groupId>uk.ac.gate</groupId> | ||
<artifactId>gate-plugin-base</artifactId> | ||
<!-- The version of GATE you wish to build against --> | ||
<!-- note that this is also the earliest version of GATE which will load | ||
your plugin --> | ||
<version>8.5.1</version> | ||
<!-- This forces the parent to always be resolved from the repo --> | ||
<relativePath /> | ||
</parent> | ||
|
||
<!-- The Maven coordinates of this plugin --> | ||
<groupId>gate.language-detection</groupId> | ||
<artifactId>language-detection</artifactId> | ||
<version>1.0-SNAPSHOT</version> | ||
|
||
<!-- Information about this plugin. This will be used when the plugin is | ||
shown in the GATE plugin manager --> | ||
|
||
<!-- Add a descriptive short name of the plugin --> | ||
<name>Language Detection</name> | ||
|
||
<!-- Add a description of what the plugin does or can be used for. Do not | ||
include the list of processing or language resources, this will be determined | ||
automatically. About one or two paragraphs. --> | ||
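<description>Integrates optimaize/language-detector into GATE as a processing resource that detects the language of a whole document or of individual annotations and stores the result as a feature.</description> | ||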
|
||
<!-- Add the URL of the plugin's user-oriented web page --> | ||
<url></url> | ||
|
||
<licenses> | ||
<license> | ||
<!-- The license used for most GATE plugins and components. Replace with | ||
any other license which is compatible with the licenses of all the software | ||
and libraries this plugin depends on. --> | ||
<name>The Apache Software License, Version 2.0</name> | ||
<url>http://www.apache.org/licenses/LICENSE-2.0.txt</url> | ||
<distribution>repo</distribution> | ||
</license> | ||
</licenses> | ||
|
||
<!-- there are plenty of other pieces of useful information you can also | ||
provide here. See the pom.xml reference for full details --> | ||
|
||
|
||
<dependencies> | ||
<!-- Add the Maven coordinates of all libraries this plugin depends on. | ||
The site https://search.maven.org/ can be used to search for libraries and | ||
copy-paste the XML. Any GATE plugins needed at compile time should use "<scope>provided</scope>" --> | ||
<dependency> | ||
<groupId>com.optimaize.languagedetector</groupId> | ||
<artifactId>language-detector</artifactId> | ||
<version>0.6</version> | ||
</dependency> | ||
</dependencies> | ||
</project> |
174 changes: 174 additions & 0 deletions
174
src/main/java/gate/languagedetection/LanguageDetection.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
package gate.languagedetection; | ||
|
||
import java.io.IOException; | ||
import java.util.List; | ||
|
||
import com.optimaize.langdetect.DetectedLanguage; | ||
import com.optimaize.langdetect.LanguageDetector; | ||
import com.optimaize.langdetect.LanguageDetectorBuilder; | ||
import com.optimaize.langdetect.ngram.NgramExtractors; | ||
import com.optimaize.langdetect.profiles.LanguageProfile; | ||
import com.optimaize.langdetect.profiles.LanguageProfileReader; | ||
|
||
import gate.Annotation; | ||
import gate.AnnotationSet; | ||
import gate.FeatureMap; | ||
import gate.Resource; | ||
import gate.creole.AbstractLanguageAnalyser; | ||
import gate.creole.ExecutionException; | ||
import gate.creole.ResourceInstantiationException; | ||
import gate.creole.metadata.CreoleParameter; | ||
import gate.creole.metadata.CreoleResource; | ||
import gate.creole.metadata.Optional; | ||
import gate.creole.metadata.RunTime; | ||
|
||
/** | ||
* This class is the implementation of the resource LanguageDetection. | ||
*/ | ||
@CreoleResource(name = "LanguageDetection", comment = "Integrate optimaize/language-detector (https://github.com/optimaize/language-detector) as a Processing Resource") | ||
public class LanguageDetection extends AbstractLanguageAnalyser { | ||
|
||
private static final long serialVersionUID = 4531104124991700665L; | ||
|
||
private static final String DETECTEDLANGUAGE_SPLIT = ", "; | ||
private static final String PROBABILITY_SPLIT = ":"; | ||
|
||
private List<String> languageFilter; | ||
|
||
private String featureName; | ||
private String inputASName; | ||
private String inputAnnotation; | ||
|
||
private Double threshold; | ||
|
||
private LanguageDetector detector; | ||
|
||
@Override | ||
public Resource init() throws ResourceInstantiationException { | ||
try { | ||
LanguageProfileReader profileReader = new LanguageProfileReader(); | ||
List<LanguageProfile> languageProfiles; | ||
if (languageFilter == null || languageFilter.isEmpty()) { | ||
languageProfiles = profileReader.readAllBuiltIn(); | ||
} else { | ||
languageProfiles = profileReader.read(languageFilter); | ||
} | ||
detector = LanguageDetectorBuilder.create(NgramExtractors.standard()).withProfiles(languageProfiles) | ||
.build(); | ||
} catch (IllegalStateException | IOException e) { | ||
throw new ResourceInstantiationException(e); | ||
} | ||
return this; | ||
} | ||
|
||
@Override | ||
public void reInit() throws ResourceInstantiationException { | ||
init(); | ||
} | ||
|
||
@Override | ||
public void execute() throws ExecutionException { | ||
try { | ||
if (isEmpty(inputASName) && isEmpty(inputAnnotation)) { | ||
String text = document.getContent().toString(); | ||
FeatureMap featureMap = document.getFeatures(); | ||
detectLanguage(text, featureMap); | ||
} else { | ||
AnnotationSet inputAnnotationSet = document.getAnnotations(inputASName); | ||
if (!isEmpty(inputAnnotation)) { | ||
inputAnnotationSet = inputAnnotationSet.get(inputAnnotation); | ||
} | ||
for (Annotation annotation : inputAnnotationSet) { | ||
String text = document.getContent() | ||
.getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset()) | ||
.toString(); | ||
FeatureMap featureMap = annotation.getFeatures(); | ||
detectLanguage(text, featureMap); | ||
} | ||
} | ||
} catch (Exception e) { | ||
throw new ExecutionException(e); | ||
} | ||
} | ||
|
||
private void detectLanguage(String text, FeatureMap featureMap) { | ||
List<DetectedLanguage> probabilities = detector.getProbabilities(text); | ||
for (DetectedLanguage detectedLanguage : probabilities) { | ||
if (threshold == null || detectedLanguage.getProbability() >= threshold) { | ||
appendLanguageToFeatureMap(featureMap, detectedLanguage.getLocale().getLanguage(), | ||
detectedLanguage.getProbability()); | ||
} | ||
} | ||
} | ||
|
||
private void appendLanguageToFeatureMap(FeatureMap featureMap, String language, double probability) { | ||
Object object = document.getFeatures().get(featureName); | ||
if (object != null) { | ||
featureMap.put(featureName, object.toString() + DETECTEDLANGUAGE_SPLIT + asString(language, probability)); | ||
} else { | ||
featureMap.put(featureName, asString(language, probability)); | ||
} | ||
} | ||
|
||
private String asString(String language, double probability) { | ||
return language + PROBABILITY_SPLIT + probability; | ||
} | ||
|
||
private boolean isEmpty(String string) { | ||
return string == null || string.length() == 0; | ||
} | ||
|
||
public List<String> getLanguageFilter() { | ||
return languageFilter; | ||
} | ||
|
||
@Optional | ||
@CreoleParameter(comment = "Only detect following languages") | ||
public void setLanguageFilter(List<String> languageFilter) { | ||
this.languageFilter = languageFilter; | ||
} | ||
|
||
public String getFeatureName() { | ||
return featureName; | ||
} | ||
|
||
@RunTime | ||
@CreoleParameter(comment = "Name of the feature to store detected language, document or annotation", defaultValue = "lang") | ||
public void setFeatureName(String featureName) { | ||
this.featureName = featureName; | ||
} | ||
|
||
public String getInputASName() { | ||
return inputASName; | ||
} | ||
|
||
@Optional | ||
@RunTime | ||
@CreoleParameter(comment = "analyse specific annotation instead of whole document") | ||
public void setInputASName(String inputASName) { | ||
this.inputASName = inputASName; | ||
} | ||
|
||
public String getInputAnnotation() { | ||
return inputAnnotation; | ||
} | ||
|
||
@Optional | ||
@RunTime | ||
@CreoleParameter(comment = "analyse specific annotation instead of whole document") | ||
public void setInputAnnotation(String inputAnnotation) { | ||
this.inputAnnotation = inputAnnotation; | ||
} | ||
|
||
@Optional | ||
@RunTime | ||
@CreoleParameter(comment = "Only annotate languages with threshold") | ||
public void setThreshold(Double threshold) { | ||
this.threshold = threshold; | ||
} | ||
|
||
public Double getThreshold() { | ||
return threshold; | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
<?xml version="1.0"?> | ||
<CREOLE-DIRECTORY> | ||
|
||
<!-- plugins can require other plugins. | ||
For example to require ANNIE you would include the following: | ||
<REQUIRES GROUP="uk.ac.gate.plugins" ARTIFACT="annie" VERSION="8.5"/> | ||
--> | ||
</CREOLE-DIRECTORY> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
package gate.languagedetection; | ||
|
||
import gate.test.GappLoadingTestCase; | ||
|
||
/** | ||
* Using this class automatically tests all pipelines for proper loading. | ||
* | ||
* This class automatically tries to load all pipelines (any file with an | ||
* extension ".gapp" or ".xgapp") which reside | ||
* in the main/resources/resources directory tree. | ||
*/ | ||
public class GappLoadingTest extends GappLoadingTestCase { | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package gate.languagedetection; | ||
|
||
import gate.test.GATEPluginTests; | ||
import org.junit.Test; | ||
import static org.junit.Assert.*; | ||
|
||
|
||
/** | ||
* Using this class automatically prepares GATE and the plugin for testing. | ||
* | ||
* This class automatically initializes GATE and loads the plugin. | ||
* Any method in this class with the "@Test" annotation will then get | ||
* run with the plugin already properly loaded. | ||
* | ||
*/ | ||
public class TestingClass extends GATEPluginTests { | ||
|
||
@Test | ||
public void testSomething() { | ||
// testing code goes here | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
groupId=${project.groupId} | ||
artifactId=${project.artifactId} | ||
version=${project.version} |