Skip to content

Commit

Permalink
finalizing process (before testing)
Browse files Browse the repository at this point in the history
  • Loading branch information
Alon Eirew committed Mar 8, 2021
1 parent 8585a30 commit 84292c9
Show file tree
Hide file tree
Showing 8 changed files with 83 additions and 43 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ totalAmountToExtract=-1 => if < 0 then read all wikipedia pages, otherwise will
```
main.outputDir=output => the output folder where the WEC json should be created
main.outputFile=GenWEC.json => the output file name of WEC json file
main.lexicalThresh=4 => lexical diversity threshold: the maximum number of mentions with the same mention text to keep per coreference chain
```

### Language Adaptation
Expand Down
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ dependencies {
implementation group: 'org.xerial', name: 'sqlite-jdbc', version: '3.27.2.1'
implementation group: 'info.bliki.wiki', name: 'bliki-core', version: '3.1.0'
implementation group: 'org.jsoup', name: 'jsoup', version: '1.13.1'
implementation group: 'me.tongfei', name: 'progressbar', version: '0.9.0'

// Spring.io
runtimeOnly 'com.h2database:h2'
Expand Down
98 changes: 58 additions & 40 deletions src/main/java/wec/ExtractWECToJson.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,32 +5,28 @@
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.reflect.TypeToken;
import com.google.gson.stream.JsonWriter;
import me.tongfei.progressbar.ProgressBar;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import wec.config.Configuration;
import wec.data.WECContext;
import wec.data.WECCoref;
import wec.data.WECMention;
import wec.persistence.MentionsRepository;
import wec.persistence.WECResources;
import wec.utils.StanfordNlpApi;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.stream.IntStream;

@Component
@Transactional
public class ExtractWECToJson {
private final static Logger LOGGER = LogManager.getLogger(ExtractWECToJson.class);

@Autowired private MentionsRepository mentionsRepository;

public void generateJson() throws IOException {
String jsonOutputDir = Configuration.getConfiguration().getJsonOutputDir();
File folder = new File(jsonOutputDir);
Expand All @@ -42,43 +38,67 @@ public void generateJson() throws IOException {
}

String jsonOutputFile = jsonOutputDir + File.separator + Configuration.getConfiguration().getJsonOutputFile();
try(FileWriter fw = new FileWriter(jsonOutputFile)) {
Iterable<WECMention> mergedCorefMentions = mentionsRepository.findAll();
JsonArray corefs = CleanAndConvertToJson(mergedCorefMentions);
Configuration.GSONPretty.toJson(corefs, fw);
StanfordNlpApi.getPipelineWithPos();
try(JsonWriter writer = new JsonWriter(new FileWriter(jsonOutputFile))) {
Iterable<WECMention> mergedCorefMentions = WECResources.getDbRepository().findAllMentions();
CleanAndWriteToJson(mergedCorefMentions, writer);
}
LOGGER.info("process complete!");
}

private JsonArray CleanAndConvertToJson(Iterable<WECMention> wecMentions) {
private void CleanAndWriteToJson(Iterable<WECMention> wecMentions, JsonWriter writer) throws IOException {
int sizeBefore = Iterables.size(wecMentions);
Iterator<WECMention> iterator = wecMentions.iterator();
LOGGER.info("Total Mentions Extracted=" + sizeBefore);
int contextRemove = 0;
int nerRemoved = 0;
JsonArray root = new JsonArray();
int lexicalRemove = 0;
Map<Long, Map<String, Integer>> lexicalDiversity = new HashMap<>();
while(iterator.hasNext()) {
WECMention wecMention = iterator.next();
if (!isContextValid(wecMention) || !fillAndCheckIsMentionValid(wecMention)) {
iterator.remove();
contextRemove++;
continue;
}

if(!lexicalDiversity.containsKey(wecMention.getCorefChain().getCorefId())) {
lexicalDiversity.put(wecMention.getCorefChain().getCorefId(), new HashMap<>());
writer.setIndent("\t");
writer.beginArray();
try (ProgressBar pb = new ProgressBar("Processing", sizeBefore)) {
while (iterator.hasNext()) {
pb.step();
WECMention wecMention = iterator.next();
if(!fillAndCheckIsMentionValid(wecMention)) {
iterator.remove();
nerRemoved++;
continue;
}

Optional<WECContext> retContext = WECResources.getDbRepository().findContextById(wecMention.getContextId());
if (retContext.isEmpty() || !isContextValid(retContext.get())) {
iterator.remove();
contextRemove++;
continue;
}

if (!lexicalDiversity.containsKey(wecMention.getCorefChain().getCorefId())) {
lexicalDiversity.put(wecMention.getCorefChain().getCorefId(), new HashMap<>());
}

if(!lexicalDiversity.get(wecMention.getCorefChain().getCorefId()).containsKey(wecMention.getMentionText())) {
lexicalDiversity.get(wecMention.getCorefChain().getCorefId()).put(wecMention.getMentionText(), 0);
}

if(lexicalDiversity.get(wecMention.getCorefChain().getCorefId()).get(wecMention.getMentionText()) >=
Configuration.getConfiguration().getLexicalThresh()) {
iterator.remove();
lexicalRemove++;
continue;
}

Configuration.GSON.toJson(convertMentionToJson(wecMention, retContext.get()), writer);
}

JsonObject jsonObject = convertMentionToJson(wecMention);
root.add(jsonObject);
}

writer.endArray();
writer.close();

LOGGER.info("Total of " + contextRemove + " mentions with problematic context");
LOGGER.info("Total of " + nerRemoved + " mentions with suspicious NER removed");
LOGGER.info("Total of " + lexicalRemove + " didn't pass lexical threshold");
LOGGER.info("Mentions remaining=" + Iterables.size(wecMentions));

return root;
}

private boolean fillAndCheckIsMentionValid(WECMention mention) {
Expand All @@ -88,14 +108,13 @@ private boolean fillAndCheckIsMentionValid(WECMention mention) {
!mentionNer.equals("NATIONALITY");
}

private boolean isContextValid(WECMention mention) {
// List<String> contextAsList = mention.getContextId().getContextAsArray();
// String contextAsString = String.join(" ", contextAsList);
// return !contextAsString.contains("colspan") && !contextAsString.contains("http");
return true;
/**
 * Checks whether a context is acceptable for export.
 *
 * <p>The context tokens are joined with single spaces; the context is rejected when the
 * joined text contains {@code "colspan"} or {@code "http"} (presumably leftover table
 * markup and URLs).
 *
 * @param context the context whose tokens are inspected
 * @return {@code true} when neither marker substring occurs in the joined context
 */
private boolean isContextValid(WECContext context) {
    String joinedContext = String.join(" ", context.getContextAsArray());
    if (joinedContext.contains("colspan")) {
        return false;
    }
    return !joinedContext.contains("http");
}

private JsonObject convertMentionToJson(WECMention mention) {
private JsonObject convertMentionToJson(WECMention mention, WECContext context) {
WECCoref coref = mention.getCorefChain();
JsonObject jo = new JsonObject();
jo.addProperty("coref_chain", mention.getCorefChain().getCorefId());
Expand All @@ -113,9 +132,8 @@ private JsonObject convertMentionToJson(WECMention mention) {
IntStream.range(mention.getTokenStart(), mention.getTokenEnd() + 1).forEachOrdered(tokNum::add);
jo.add("tokens_number", tokNum);

// JsonElement element = Configuration.GSON.toJsonTree(mention.getContextId().getContextAsArray(),
// new TypeToken<List<String>>() {}.getType());
// jo.add("mention_context", element.getAsJsonArray());
JsonElement element = Configuration.GSON.toJsonTree(context.getContextAsArray(), new TypeToken<List<String>>() {}.getType());
jo.add("mention_context", element.getAsJsonArray());

return jo;
}
Expand Down
6 changes: 6 additions & 0 deletions src/main/java/wec/config/Configuration.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ public class Configuration {
private final int totalAmountToExtract;
private final String jsonOutputDir;
private final String jsonOutputFile;
private final int lexicalThresh;
private final InfoboxConfiguration infoboxConfiguration;

private Configuration(Environment environment) {
Expand All @@ -37,6 +38,7 @@ private Configuration(Environment environment) {
this.totalAmountToExtract = Integer.parseInt(Objects.requireNonNull(environment.getProperty("main.totalAmountToExtract", "1000")));
this.jsonOutputDir = environment.getProperty("main.outputDir", "output");
this.jsonOutputFile = environment.getProperty("main.outputFile", "GenWEC.json");
this.lexicalThresh = Integer.parseInt(Objects.requireNonNull(environment.getProperty("main.lexicalThresh", "4")));

InputStream inputStreamConfigFile = Objects.requireNonNull(Configuration.class.getClassLoader()
.getResourceAsStream(this.infoboxConfigurationFile));
Expand Down Expand Up @@ -96,4 +98,8 @@ public String getJsonOutputDir() {
public String getJsonOutputFile() {
return jsonOutputFile;
}

/**
 * Returns the configured lexical diversity threshold.
 *
 * @return the value of the {@code lexicalThresh} field
 */
public int getLexicalThresh() {
    return this.lexicalThresh;
}
}
2 changes: 1 addition & 1 deletion src/main/java/wec/data/WECMention.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
@Table(name = "MENTIONS")
public class WECMention extends BaseMention {

@ManyToOne(fetch = FetchType.LAZY)
@ManyToOne(fetch = FetchType.EAGER)
private WECCoref corefChain;
private String mentionText;

Expand Down
11 changes: 10 additions & 1 deletion src/main/java/wec/persistence/DBRepository.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,14 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Optional;

@Repository
@Transactional
public class DBRepository {
@Autowired private ContextRepository contextRepository;
@Autowired private CorefRepository corefRepository;
@Autowired private CorefRepository mentionRepository;
@Autowired private MentionsRepository mentionRepository;

private static final Logger LOGGER = LoggerFactory.getLogger(DBRepository.class);

Expand All @@ -45,4 +46,12 @@ public void saveCorefAndMentions(Collection<WECCoref> corefs) {
this.corefRepository.saveAll(toPersist);
LOGGER.info(toPersist.size() + " corefs committed to database");
}

/**
 * Fetches all mentions through the mentions repository.
 *
 * @return whatever {@code findAll()} on the mentions repository returns
 */
public Iterable<WECMention> findAllMentions() {
    final Iterable<WECMention> allMentions = this.mentionRepository.findAll();
    return allMentions;
}

/**
 * Looks up a single context by its identifier through the context repository.
 *
 * @param contextId identifier of the context to load
 * @return the result of {@code findById(contextId)} on the context repository
 */
public Optional<WECContext> findContextById(long contextId) {
    final Optional<WECContext> foundContext = this.contextRepository.findById(contextId);
    return foundContext;
}
}
4 changes: 4 additions & 0 deletions src/main/java/wec/utils/StanfordNlpApi.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,8 @@ public static CoreDocument withPosAnnotate(String context) {
}
return null;
}

/**
 * Exposes the class-level POS pipeline instance.
 *
 * @return the value of the static {@code pipelineWithPos} field
 */
public static StanfordCoreNLP getPipelineWithPos() {
    return StanfordNlpApi.pipelineWithPos;
}
}
3 changes: 2 additions & 1 deletion src/main/resources/application.properties
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ spring.datasource.driverClassName=org.h2.Driver
spring.jpa.database-platform=org.hibernate.dialect.H2Dialect
spring.jpa.hibernate.ddl-auto=update
spring.jpa.properties.hibernate.jdbc.batch_size=20
spring.jpa.show-sql=true
spring.jpa.show-sql=false
spring.datasource.url=jdbc:h2:file:/Users/aeirew/workspace/DataBase/demo1
#spring.datasource.initialization-mode=always

Expand All @@ -22,3 +22,4 @@ main.totalAmountToExtract=-1
# runWecJson Configurations
main.outputDir=output
main.outputFile=GenWEC.json
main.lexicalThresh=4

0 comments on commit 84292c9

Please sign in to comment.