diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a37d209
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,8 @@
+/bin
+/classes
+/.externalToolBuilders
+*.jar
+/target/
+.classpath
+.project
+/.settings/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a154f42
--- /dev/null
+++ b/README.md
@@ -0,0 +1,2 @@
+# gateplugin-LanguageDetection
+Integrate optimaize/language-detector into GATE
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..b989491
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,62 @@
+
+
+ 4.0.0
+
+
+ uk.ac.gate
+ gate-plugin-base
+
+
+ 8.5.1
+
+
+
+
+
+ gate.language-detection
+ language-detection
+ 1.0-SNAPSHOT
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ The Apache Software License, Version 2.0
+ http://www.apache.org/licenses/LICENSE-2.0.txt
+ repo
+
+
+
+
+
+
+
+
+
+ com.optimaize.languagedetector
+ language-detector
+ 0.6
+
+
+
diff --git a/src/main/java/gate/languagedetection/LanguageDetection.java b/src/main/java/gate/languagedetection/LanguageDetection.java
new file mode 100644
index 0000000..5bc6afb
--- /dev/null
+++ b/src/main/java/gate/languagedetection/LanguageDetection.java
@@ -0,0 +1,211 @@
+package gate.languagedetection;
+
+import java.io.IOException;
+import java.util.List;
+
+import com.optimaize.langdetect.DetectedLanguage;
+import com.optimaize.langdetect.LanguageDetector;
+import com.optimaize.langdetect.LanguageDetectorBuilder;
+import com.optimaize.langdetect.ngram.NgramExtractors;
+import com.optimaize.langdetect.profiles.LanguageProfile;
+import com.optimaize.langdetect.profiles.LanguageProfileReader;
+
+import gate.Annotation;
+import gate.AnnotationSet;
+import gate.FeatureMap;
+import gate.Resource;
+import gate.creole.AbstractLanguageAnalyser;
+import gate.creole.ExecutionException;
+import gate.creole.ResourceInstantiationException;
+import gate.creole.metadata.CreoleParameter;
+import gate.creole.metadata.CreoleResource;
+import gate.creole.metadata.Optional;
+import gate.creole.metadata.RunTime;
+
+/**
+ * GATE processing resource that detects languages with
+ * optimaize/language-detector.
+ *
+ * Each detected language is stored as {@code language:probability} under the
+ * feature named by {@code featureName}; several languages are joined with
+ * {@code ", "}.
+ */
+@CreoleResource(name = "LanguageDetection", comment = "Integrate optimaize/language-detector (https://github.com/optimaize/language-detector) as a Processing Resource")
+public class LanguageDetection extends AbstractLanguageAnalyser {
+
+    private static final long serialVersionUID = 4531104124991700665L;
+
+    private static final String DETECTEDLANGUAGE_SPLIT = ", ";
+    private static final String PROBABILITY_SPLIT = ":";
+
+    private List<String> languageFilter;
+
+    private String featureName;
+    private String inputASName;
+    private String inputAnnotation;
+
+    private Double threshold;
+
+    private LanguageDetector detector;
+
+    /**
+     * Builds the detector from all built-in profiles, or only from the profiles
+     * named in {@code languageFilter} when that list is non-empty.
+     *
+     * @return this resource
+     * @throws ResourceInstantiationException if reading the profiles or building
+     *         the detector fails
+     */
+    @Override
+    public Resource init() throws ResourceInstantiationException {
+        try {
+            LanguageProfileReader profileReader = new LanguageProfileReader();
+            List<LanguageProfile> languageProfiles;
+            if (languageFilter == null || languageFilter.isEmpty()) {
+                languageProfiles = profileReader.readAllBuiltIn();
+            } else {
+                languageProfiles = profileReader.read(languageFilter);
+            }
+            detector = LanguageDetectorBuilder.create(NgramExtractors.standard()).withProfiles(languageProfiles)
+                    .build();
+        } catch (IllegalStateException | IOException e) {
+            throw new ResourceInstantiationException(e);
+        }
+        return this;
+    }
+
+    /** Rebuilds the detector by running {@link #init()} again. */
+    @Override
+    public void reInit() throws ResourceInstantiationException {
+        init();
+    }
+
+    /**
+     * Runs language detection on the current document.
+     *
+     * When neither {@code inputASName} nor {@code inputAnnotation} is set, the
+     * whole document text is analysed and the result is stored in the document
+     * features. Otherwise every annotation of the selected set (restricted to
+     * type {@code inputAnnotation} when given) is analysed and the result is
+     * stored in that annotation's features.
+     *
+     * @throws ExecutionException wrapping any failure during detection
+     */
+    @Override
+    public void execute() throws ExecutionException {
+        try {
+            if (isEmpty(inputASName) && isEmpty(inputAnnotation)) {
+                String text = document.getContent().toString();
+                FeatureMap featureMap = document.getFeatures();
+                detectLanguage(text, featureMap);
+            } else {
+                AnnotationSet inputAnnotationSet = document.getAnnotations(inputASName);
+                if (!isEmpty(inputAnnotation)) {
+                    inputAnnotationSet = inputAnnotationSet.get(inputAnnotation);
+                }
+                for (Annotation annotation : inputAnnotationSet) {
+                    String text = document.getContent()
+                            .getContent(annotation.getStartNode().getOffset(), annotation.getEndNode().getOffset())
+                            .toString();
+                    FeatureMap featureMap = annotation.getFeatures();
+                    detectLanguage(text, featureMap);
+                }
+            }
+        } catch (Exception e) {
+            throw new ExecutionException(e);
+        }
+    }
+
+    /**
+     * Detects the languages of {@code text} and records each one that meets the
+     * threshold (or all of them when no threshold is set) in {@code featureMap}.
+     */
+    private void detectLanguage(String text, FeatureMap featureMap) {
+        List<DetectedLanguage> probabilities = detector.getProbabilities(text);
+        for (DetectedLanguage detectedLanguage : probabilities) {
+            if (threshold == null || detectedLanguage.getProbability() >= threshold) {
+                appendLanguageToFeatureMap(featureMap, detectedLanguage.getLocale().getLanguage(),
+                        detectedLanguage.getProbability());
+            }
+        }
+    }
+
+    /**
+     * Adds {@code language:probability} to the feature {@code featureName} of
+     * {@code featureMap}, appending to a value that is already present.
+     */
+    private void appendLanguageToFeatureMap(FeatureMap featureMap, String language, double probability) {
+        // Look up the existing value in the map being written, so results for an
+        // annotation accumulate on that annotation and not via the document features.
+        Object object = featureMap.get(featureName);
+        if (object != null) {
+            featureMap.put(featureName, object.toString() + DETECTEDLANGUAGE_SPLIT + asString(language, probability));
+        } else {
+            featureMap.put(featureName, asString(language, probability));
+        }
+    }
+
+    /** Formats one result as {@code language:probability}. */
+    private String asString(String language, double probability) {
+        return language + PROBABILITY_SPLIT + probability;
+    }
+
+    /** Returns {@code true} for {@code null} and for the empty string. */
+    private boolean isEmpty(String string) {
+        return string == null || string.isEmpty();
+    }
+
+    public List<String> getLanguageFilter() {
+        return languageFilter;
+    }
+
+    @Optional
+    @CreoleParameter(comment = "Only detect following languages")
+    public void setLanguageFilter(List<String> languageFilter) {
+        this.languageFilter = languageFilter;
+    }
+
+    public String getFeatureName() {
+        return featureName;
+    }
+
+    @RunTime
+    @CreoleParameter(comment = "Name of the feature to store detected language, document or annotation", defaultValue = "lang")
+    public void setFeatureName(String featureName) {
+        this.featureName = featureName;
+    }
+
+    public String getInputASName() {
+        return inputASName;
+    }
+
+    @Optional
+    @RunTime
+    @CreoleParameter(comment = "analyse specific annotation instead of whole document")
+    public void setInputASName(String inputASName) {
+        this.inputASName = inputASName;
+    }
+
+    public String getInputAnnotation() {
+        return inputAnnotation;
+    }
+
+    @Optional
+    @RunTime
+    @CreoleParameter(comment = "analyse specific annotation instead of whole document")
+    public void setInputAnnotation(String inputAnnotation) {
+        this.inputAnnotation = inputAnnotation;
+    }
+
+    @Optional
+    @RunTime
+    @CreoleParameter(comment = "Only annotate languages with threshold")
+    public void setThreshold(Double threshold) {
+        this.threshold = threshold;
+    }
+
+    public Double getThreshold() {
+        return threshold;
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/resources/creole.xml b/src/main/resources/creole.xml
new file mode 100644
index 0000000..ebeeccf
--- /dev/null
+++ b/src/main/resources/creole.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
diff --git a/src/test/java/gate/languagedetection/GappLoadingTest.java b/src/test/java/gate/languagedetection/GappLoadingTest.java
new file mode 100644
index 0000000..f6065ca
--- /dev/null
+++ b/src/test/java/gate/languagedetection/GappLoadingTest.java
@@ -0,0 +1,14 @@
+package gate.languagedetection;
+
+import gate.test.GappLoadingTestCase;
+
+/**
+ * Using this class automatically tests all pipelines for proper loading.
+ *
+ * This class automatically tries to load all pipelines (any file with an
+ * extension ".gapp" or ".xgapp") which reside
+ * in the main/resources/resources directory tree.
+ */
+public class GappLoadingTest extends GappLoadingTestCase {
+
+}
diff --git a/src/test/java/gate/languagedetection/TestingClass.java b/src/test/java/gate/languagedetection/TestingClass.java
new file mode 100644
index 0000000..95bbcdc
--- /dev/null
+++ b/src/test/java/gate/languagedetection/TestingClass.java
@@ -0,0 +1,22 @@
+package gate.languagedetection;
+
+import gate.test.GATEPluginTests;
+import org.junit.Test;
+import static org.junit.Assert.*;
+
+
+/**
+ * Using this class automatically prepares GATE and the plugin for testing.
+ *
+ * This class automatically initializes GATE and loads the plugin.
+ * Any method in this class with the "@Test" annotation will then get
+ * run with the plugin already properly loaded.
+ *
+ */
+public class TestingClass extends GATEPluginTests {
+
+  @Test
+  public void testSomething() {
+    // testing code goes here
+  }
+}
diff --git a/src/test/resources/creole.properties b/src/test/resources/creole.properties
new file mode 100644
index 0000000..951752f
--- /dev/null
+++ b/src/test/resources/creole.properties
@@ -0,0 +1,3 @@
+groupId=${project.groupId}
+artifactId=${project.artifactId}
+version=${project.version}