From 84292c946f41f37c96154d07b3f2dc36630fa354 Mon Sep 17 00:00:00 2001
From: Alon Eirew
Date: Mon, 8 Mar 2021 11:56:38 +0200
Subject: [PATCH] finalizing process (before testing)

---
 README.md                                   |  1 +
 build.gradle                                |  1 +
 src/main/java/wec/ExtractWECToJson.java     | 98 +++++++++++--------
 src/main/java/wec/config/Configuration.java |  6 ++
 src/main/java/wec/data/WECMention.java      |  2 +-
 .../java/wec/persistence/DBRepository.java  | 11 ++-
 src/main/java/wec/utils/StanfordNlpApi.java |  4 +
 src/main/resources/application.properties   |  3 +-
 8 files changed, 83 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index 01b3098..b93b021 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,7 @@ totalAmountToExtract=-1 => if < 0 then read all wikipedia pages, otherwise will
 ```
 main.outputDir=output => the output folder where the WEC json should be created
 main.outputFile=GenWEC.json => the output file name of WEC json file
+main.lexicalThresh=4 => lexical diversity threshold, limits how many mentions with identical text are kept per coreference chain
 ```

 ### Language Adaptation
diff --git a/build.gradle b/build.gradle
index 3460362..bd38c7e 100644
--- a/build.gradle
+++ b/build.gradle
@@ -39,6 +39,7 @@ dependencies {
     implementation group: 'org.xerial', name: 'sqlite-jdbc', version: '3.27.2.1'
     implementation group: 'info.bliki.wiki', name: 'bliki-core', version: '3.1.0'
     implementation group: 'org.jsoup', name: 'jsoup', version: '1.13.1'
+    implementation group: 'me.tongfei', name: 'progressbar', version: '0.9.0'

     // Spring.io
     runtimeOnly 'com.h2database:h2'
diff --git a/src/main/java/wec/ExtractWECToJson.java b/src/main/java/wec/ExtractWECToJson.java
index 1f274c1..e2475e4 100644
--- a/src/main/java/wec/ExtractWECToJson.java
+++ b/src/main/java/wec/ExtractWECToJson.java
@@ -5,32 +5,28 @@
 import com.google.gson.JsonElement;
 import com.google.gson.JsonObject;
 import com.google.gson.reflect.TypeToken;
+import com.google.gson.stream.JsonWriter;
+import me.tongfei.progressbar.ProgressBar;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
-import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.stereotype.Component;
-import org.springframework.transaction.annotation.Transactional;
 import wec.config.Configuration;
+import wec.data.WECContext;
 import wec.data.WECCoref;
 import wec.data.WECMention;
-import wec.persistence.MentionsRepository;
+import wec.persistence.WECResources;
+import wec.utils.StanfordNlpApi;

 import java.io.File;
 import java.io.FileWriter;
 import java.io.IOException;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.stream.IntStream;

 @Component
-@Transactional
 public class ExtractWECToJson {

     private final static Logger LOGGER = LogManager.getLogger(ExtractWECToJson.class);
-    @Autowired private MentionsRepository mentionsRepository;
-
     public void generateJson() throws IOException {
         String jsonOutputDir = Configuration.getConfiguration().getJsonOutputDir();
         File folder = new File(jsonOutputDir);
@@ -42,43 +38,67 @@ public void generateJson() throws IOException {
         }
         String jsonOutputFile = jsonOutputDir + File.separator + Configuration.getConfiguration().getJsonOutputFile();
-        try(FileWriter fw = new FileWriter(jsonOutputFile)) {
-            Iterable<WECMention> mergedCorefMentions = mentionsRepository.findAll();
-            JsonArray corefs = CleanAndConvertToJson(mergedCorefMentions);
-            Configuration.GSONPretty.toJson(corefs, fw);
+        StanfordNlpApi.getPipelineWithPos();
+        try(JsonWriter writer = new JsonWriter(new FileWriter(jsonOutputFile))) {
+            Iterable<WECMention> mergedCorefMentions = WECResources.getDbRepository().findAllMentions();
+            CleanAndWriteToJson(mergedCorefMentions, writer);
         }

         LOGGER.info("process complete!");
     }

-    private JsonArray CleanAndConvertToJson(Iterable<WECMention> wecMentions) {
+    private void CleanAndWriteToJson(Iterable<WECMention> wecMentions, JsonWriter writer) throws IOException {
         int sizeBefore = Iterables.size(wecMentions);
         Iterator<WECMention> iterator = wecMentions.iterator();
         LOGGER.info("Total Mentions Extracted=" + sizeBefore);
         int contextRemove = 0;
         int nerRemoved = 0;
-        JsonArray root = new JsonArray();
+        int lexicalRemove = 0;
         Map<Integer, Map<String, Integer>> lexicalDiversity = new HashMap<>();
-        while(iterator.hasNext()) {
-            WECMention wecMention = iterator.next();
-            if (!isContextValid(wecMention) || !fillAndCheckIsMentionValid(wecMention)) {
-                iterator.remove();
-                contextRemove++;
-                continue;
-            }
-
-            if(!lexicalDiversity.containsKey(wecMention.getCorefChain().getCorefId())) {
-                lexicalDiversity.put(wecMention.getCorefChain().getCorefId(), new HashMap<>());
+        writer.setIndent("\t");
+        writer.beginArray();
+        try (ProgressBar pb = new ProgressBar("Processing", sizeBefore)) {
+            while (iterator.hasNext()) {
+                pb.step();
+                WECMention wecMention = iterator.next();
+                if(!fillAndCheckIsMentionValid(wecMention)) {
+                    iterator.remove();
+                    nerRemoved++;
+                    continue;
+                }
+
+                Optional<WECContext> retContext = WECResources.getDbRepository().findContextById(wecMention.getContextId());
+                if (retContext.isEmpty() || !isContextValid(retContext.get())) {
+                    iterator.remove();
+                    contextRemove++;
+                    continue;
+                }
+
+                if (!lexicalDiversity.containsKey(wecMention.getCorefChain().getCorefId())) {
+                    lexicalDiversity.put(wecMention.getCorefChain().getCorefId(), new HashMap<>());
+                }
+
+                if(!lexicalDiversity.get(wecMention.getCorefChain().getCorefId()).containsKey(wecMention.getMentionText())) {
+                    lexicalDiversity.get(wecMention.getCorefChain().getCorefId()).put(wecMention.getMentionText(), 0);
+                }
+
+                if(lexicalDiversity.get(wecMention.getCorefChain().getCorefId()).get(wecMention.getMentionText()) >=
+                        Configuration.getConfiguration().getLexicalThresh()) {
+                    iterator.remove();
+                    lexicalRemove++;
+                    continue;
+                }
+
+                Configuration.GSON.toJson(convertMentionToJson(wecMention, retContext.get()), writer);
             }
-
-            JsonObject jsonObject = convertMentionToJson(wecMention);
-            root.add(jsonObject);
         }

+        writer.endArray();
+        writer.close();
+
         LOGGER.info("Total of " + contextRemove + " mentions with problematic context");
         LOGGER.info("Total of " + nerRemoved + " mentions with suspicious NER removed");
+        LOGGER.info("Total of " + lexicalRemove + " didn't pass lexical threshold");
         LOGGER.info("Mentions remaining=" + Iterables.size(wecMentions));
-
-        return root;
     }

     private boolean fillAndCheckIsMentionValid(WECMention mention) {
@@ -88,14 +108,13 @@ private boolean fillAndCheckIsMentionValid(WECMention mention) {
         !mentionNer.equals("NATIONALITY");
     }

-    private boolean isContextValid(WECMention mention) {
-//        List<String> contextAsList = mention.getContextId().getContextAsArray();
-//        String contextAsString = String.join(" ", contextAsList);
-//        return !contextAsString.contains("colspan") && !contextAsString.contains("http");
-        return true;
+    private boolean isContextValid(WECContext context) {
+        List<String> contextAsArray = context.getContextAsArray();
+        String contextAsString = String.join(" ", contextAsArray);
+        return !contextAsString.contains("colspan") && !contextAsString.contains("http");
     }

-    private JsonObject convertMentionToJson(WECMention mention) {
+    private JsonObject convertMentionToJson(WECMention mention, WECContext context) {
         WECCoref coref = mention.getCorefChain();
         JsonObject jo = new JsonObject();
         jo.addProperty("coref_chain", mention.getCorefChain().getCorefId());
@@ -113,9 +132,8 @@ private JsonObject convertMentionToJson(WECMention mention) {
         IntStream.range(mention.getTokenStart(), mention.getTokenEnd() + 1).forEachOrdered(tokNum::add);
         jo.add("tokens_number", tokNum);

-//        JsonElement element = Configuration.GSON.toJsonTree(mention.getContextId().getContextAsArray(),
-//                new TypeToken<List<String>>() {}.getType());
-//        jo.add("mention_context", element.getAsJsonArray());
+        JsonElement element = Configuration.GSON.toJsonTree(context.getContextAsArray(), new TypeToken<List<String>>() {}.getType());
+        jo.add("mention_context", element.getAsJsonArray());
         return jo;
     }

diff --git a/src/main/java/wec/config/Configuration.java b/src/main/java/wec/config/Configuration.java
index cbf06a4..ae807b5 100644
--- a/src/main/java/wec/config/Configuration.java
+++ b/src/main/java/wec/config/Configuration.java
@@ -24,6 +24,7 @@ public class Configuration {
     private final int totalAmountToExtract;
     private final String jsonOutputDir;
     private final String jsonOutputFile;
+    private final int lexicalThresh;
     private final InfoboxConfiguration infoboxConfiguration;

     private Configuration(Environment environment) {
@@ -37,6 +38,7 @@ private Configuration(Environment environment) {
         this.totalAmountToExtract = Integer.parseInt(Objects.requireNonNull(environment.getProperty("main.totalAmountToExtract", "1000")));
         this.jsonOutputDir = environment.getProperty("main.outputDir", "output");
         this.jsonOutputFile = environment.getProperty("main.outputFile", "GenWEC.json");
+        this.lexicalThresh = Integer.parseInt(Objects.requireNonNull(environment.getProperty("main.lexicalThresh", "4")));

         InputStream inputStreamConfigFile = Objects.requireNonNull(Configuration.class.getClassLoader()
                 .getResourceAsStream(this.infoboxConfigurationFile));
@@ -96,4 +98,8 @@ public String getJsonOutputDir() {
     public String getJsonOutputFile() {
         return jsonOutputFile;
     }
+
+    public int getLexicalThresh() {
+        return lexicalThresh;
+    }
 }
diff --git a/src/main/java/wec/data/WECMention.java b/src/main/java/wec/data/WECMention.java
index 0d2d97f..f86ad5e 100644
--- a/src/main/java/wec/data/WECMention.java
+++ b/src/main/java/wec/data/WECMention.java
@@ -12,7 +12,7 @@
 @Table(name = "MENTIONS")
 public class WECMention extends BaseMention {

-    @ManyToOne(fetch = FetchType.LAZY)
+    @ManyToOne(fetch = FetchType.EAGER)
     private WECCoref corefChain;
     private String mentionText;

diff --git a/src/main/java/wec/persistence/DBRepository.java b/src/main/java/wec/persistence/DBRepository.java
index 5ca3c95..2dc6d85 100644
--- a/src/main/java/wec/persistence/DBRepository.java
+++ b/src/main/java/wec/persistence/DBRepository.java
@@ -12,13 +12,14 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.Optional;

 @Repository
 @Transactional
 public class DBRepository {

     @Autowired private ContextRepository contextRepository;
     @Autowired private CorefRepository corefRepository;
-    @Autowired private CorefRepository mentionRepository;
+    @Autowired private MentionsRepository mentionRepository;

     private static final Logger LOGGER = LoggerFactory.getLogger(DBRepository.class);
@@ -45,4 +46,12 @@ public void saveCorefAndMentions(Collection<WECCoref> corefs) {
         this.corefRepository.saveAll(toPersist);
         LOGGER.info(toPersist.size() + " corefs committed to database");
     }
+
+    public Iterable<WECMention> findAllMentions() {
+        return this.mentionRepository.findAll();
+    }
+
+    public Optional<WECContext> findContextById(long contextId) {
+        return this.contextRepository.findById(contextId);
+    }
 }
diff --git a/src/main/java/wec/utils/StanfordNlpApi.java b/src/main/java/wec/utils/StanfordNlpApi.java
index b43f010..c796e47 100644
--- a/src/main/java/wec/utils/StanfordNlpApi.java
+++ b/src/main/java/wec/utils/StanfordNlpApi.java
@@ -25,4 +25,8 @@ public static CoreDocument withPosAnnotate(String context) {
         }
         return null;
     }
+
+    public static StanfordCoreNLP getPipelineWithPos() {
+        return pipelineWithPos;
+    }
 }
diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties
index d500ba3..79a143d 100644
--- a/src/main/resources/application.properties
+++ b/src/main/resources/application.properties
@@ -5,7 +5,7 @@ spring.datasource.driverClassName=org.h2.Driver
 spring.jpa.database-platform=org.hibernate.dialect.H2Dialect
 spring.jpa.hibernate.ddl-auto=update
 spring.jpa.properties.hibernate.jdbc.batch_size=20
-spring.jpa.show-sql=true
+spring.jpa.show-sql=false
 spring.datasource.url=jdbc:h2:file:/Users/aeirew/workspace/DataBase/demo1
 #spring.datasource.initialization-mode=always

@@ -22,3 +22,4 @@ main.totalAmountToExtract=-1
 # runWecJson Configurations
 main.outputDir=output
 main.outputFile=GenWEC.json
+main.lexicalThresh=4