Skip to content

Commit

Permalink
integrate infobox change
Browse files (browse the repository at this point in the history)
  • Loading branch information
Alon Eirew committed Aug 10, 2020
1 parent 569e81c commit efb0525
Show file tree
Hide file tree
Showing 31 changed files with 392 additions and 340 deletions.
3 changes: 2 additions & 1 deletion infobox_config/en_infobox_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
"aircraftincident",
"aircraftoccurrence",
"railaccident",
"busaccident"
"busaccident",
"publictransitaccident"
]
},
{
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/data/InfoboxConfiguration.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ public void setInfoboxConfigs(List<InfoboxConfig> infoboxConfigs) {
private DefaultInfoboxExtractor initExtractorAndGet(InfoboxConfig locConfig) {
DefaultInfoboxExtractor extractor = locConfig.getExtractor();
if (extractor == null) {
Pattern pattern = Pattern.compile("\\{\\{" + this.infoboxLangText.toLowerCase() +
"[\\w|]*?(" + String.join("|", locConfig.getInfoboxs()) + ")");
String regex = "\\{\\{" + this.infoboxLangText.toLowerCase() +
"[\\w|]*?(" + String.join("|", locConfig.getInfoboxs()) + ")";
Pattern pattern = Pattern.compile(regex);
if (locConfig.getUseExtractorClass() != null && !locConfig.getUseExtractorClass().isEmpty()) {
try {
Constructor<?>[] constructors = Class.forName(locConfig.getUseExtractorClass()).getConstructors();
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/data/RawElasticResult.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@ public class RawElasticResult {
private String id = "-1";
private final String title;
private final String text;
private final String infobox;

public RawElasticResult(String title, String text) {
public RawElasticResult(String title, String text, String infobox) {
this.title = title;
this.text = text;
this.infobox = infobox;
}

public RawElasticResult(String id, String title, String text) {
this(title, text);
public RawElasticResult(String id, String title, String text, String infobox) {
this(title, text, infobox);
this.id = id;
}

Expand All @@ -27,4 +29,8 @@ public String getTitle() {
public String getText() {
return text;
}

public String getInfobox() {
return infobox;
}
}
15 changes: 11 additions & 4 deletions src/main/java/persistence/ElasticQueryApi.java
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,21 @@ private Map<String, RawElasticResult> onResponse(MultiSearchResponse response) {
public static RawElasticResult extractFromHit(SearchHit hit) {
final String id = hit.getId();
final Map map = hit.getSourceAsMap();
final String text = (String)map.get("text");
final String title = (String)map.get("title");
final String text = (String) map.get("text");
final String title = (String) map.get("title");
final String infobox = (String) map.get("infobox");
final String redirect = (String) map.get("redirectTitle");
final boolean isDisambig = (Boolean) map.get("relations.isDisambiguation");

if(text.toLowerCase().startsWith("#redirect")) {
if(redirect != null && !redirect.isEmpty()) {
return null;
}

return new RawElasticResult(id, title, text);
if (isDisambig) {
return null;
}

return new RawElasticResult(id, title, text, infobox);
}

public List<RawElasticResult> getNextScrollResults(SearchHit[] searchHits) {
Expand Down
39 changes: 2 additions & 37 deletions src/main/java/wec/InfoboxFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,9 @@ public InfoboxFilter(InfoboxConfiguration infoboxConfiguration) {
@Override
public boolean isConditionMet(RawElasticResult result) {
if (result != null && result.getText() != null && !result.getText().isEmpty()) {
final String infoBox = this.extractPageInfoBox(result.getText());
if (infoBox != null && !infoBox.isEmpty()) {
if (result.getInfobox() != null && !result.getInfobox().isEmpty()) {
for (DefaultInfoboxExtractor extractor : this.infoboxConfiguration.getAllIncludedExtractor()) {
final String extractMatchedInfobox = extractor.extractMatchedInfobox(result.getText(), result.getTitle());
final String extractMatchedInfobox = extractor.extractMatchedInfobox(result.getInfobox(), result.getTitle());
final String corefType = extractor.getCorefType();

if (!extractMatchedInfobox.equals(DefaultInfoboxExtractor.NA)) {
Expand All @@ -38,38 +37,4 @@ public boolean isConditionMet(RawElasticResult result) {

return false;
}

public String extractPageInfoBox(String pageText) {
return extractPageInfoBox(pageText, false);
}

public String extractPageInfoBox(String pageText, boolean toLowerForm) {
StringBuilder infoBoxFinal = new StringBuilder();

final int beginIndex = pageText.indexOf("{{" + infoboxConfiguration.getInfoboxLangText());
if (beginIndex != -1) {
final String infoboxSubstring = pageText.substring(beginIndex);
int infoBarCount = 0;
for (int i = 0; i < infoboxSubstring.length(); i++) {
final char c = infoboxSubstring.charAt(i);
if (c == '}') {
infoBarCount--;
if (infoBarCount == 0) {
infoBoxFinal.append(c);
break;
}
} else if (c == '{') {
infoBarCount++;
}

infoBoxFinal.append(c);
}
}

if(toLowerForm) {
return infoBoxFinal.toString().toLowerCase().replaceAll(" ", "");
}

return infoBoxFinal.toString();
}
}
2 changes: 1 addition & 1 deletion src/main/java/wec/extractors/CompanyInfoboxExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

public class CompanyInfoboxExtractor extends DefaultInfoboxExtractor {

private static final int MAX_EMPLOYEES = 1000;
private static final int MAX_EMPLOYEES = 1700;

public CompanyInfoboxExtractor(String corefType, Pattern pattern) {
super(corefType, pattern);
Expand Down
21 changes: 13 additions & 8 deletions src/main/java/wec/extractors/TimeSpan1MonthInfoboxExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

public class TimeSpan1MonthInfoboxExtractor extends DefaultInfoboxExtractor {

private static final Pattern datePattern = Pattern.compile("\\|[\\s\\t]*date[\\s\\t]*=");

public TimeSpan1MonthInfoboxExtractor(String corefType, Pattern pattern) {
super(corefType, pattern);
}
Expand Down Expand Up @@ -46,14 +48,17 @@ public boolean isSpanSingleMonth(String infoBox) {

public static String extractDateLine(String infoBox) {
String date = "";
if(infoBox != null && infoBox.contains("date")) {
String dateSubStr = infoBox.substring(infoBox.indexOf("date"));
if(dateSubStr.contains("<br>")) {
date = dateSubStr.substring(0, dateSubStr.indexOf("<br>"));
} else if(dateSubStr.contains("\n")) {
date = dateSubStr.substring(0, dateSubStr.indexOf("\n"));
} else {
date = dateSubStr;
if(infoBox != null) {
Matcher match = datePattern.matcher(infoBox);
if(match.find()) {
String dateSubStr = infoBox.substring(match.start());
if (dateSubStr.contains("<br>")) {
date = dateSubStr.substring(0, dateSubStr.indexOf("<br>"));
} else if (dateSubStr.contains("\n")) {
date = dateSubStr.substring(0, dateSubStr.indexOf("\n"));
} else {
date = dateSubStr;
}
}
}

Expand Down
7 changes: 3 additions & 4 deletions src/main/java/workers/ParseAndExtractMentionsWorker.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,9 @@ public ParseAndExtractMentionsWorker(List<RawElasticResult> rawElasticResults, S
public void run() {
List<WECMention> finalToCommit = new ArrayList<>();
LOGGER.info("Parsing the wikipedia pages and extracting mentions");
for(RawElasticResult rowResult : this.rawElasticResults) {
String infobox = filter.extractPageInfoBox(rowResult.getText());
if(infobox != null && !infobox.isEmpty()) {
List<WECMention> wecMentions = WECLinksExtractor.extractFromWikipedia(rowResult);
for(RawElasticResult rawResult : this.rawElasticResults) {
if(rawResult.getInfobox() != null && !rawResult.getInfobox().isEmpty()) {
List<WECMention> wecMentions = WECLinksExtractor.extractFromWikipedia(rawResult);
if (!wecMentions.isEmpty()) {
finalToCommit.addAll(wecMentions);
}
Expand Down
14 changes: 5 additions & 9 deletions src/main/java/workers/ReadDateWorker.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,23 @@ public ReadDateWorker(List<RawElasticResult> rawElasticResults, List<String> dat
@Override
public void run() {
for(RawElasticResult rawResult : this.rawElasticResults) {
String date = extractDate(rawResult.getText(), rawResult.getTitle());
String date = extractDate(rawResult);
if(date != null && !date.isEmpty()) {
this.datesSchemas.add(date);
}
}
}

private String extractDate(String text, String title) {
InfoboxFilter filter = new InfoboxFilter(new InfoboxConfiguration());
String infoBox = filter.extractPageInfoBox(text);
DefaultInfoboxExtractor attack = new TimeSpan1MonthInfoboxExtractor(null, null);

String infoboxLow = infoBox.toLowerCase().replaceAll(" ", "");
private String extractDate(RawElasticResult rawResult) {
String infoboxLow = rawResult.getInfobox().toLowerCase().replaceAll(" ", "");
if (infoboxLow.contains("{{infoboxcivilianattack") || infoboxLow.contains("{{infoboxterroristattack") ||
infoboxLow.contains("{{infoboxmilitaryattack") || infoboxLow.contains("{{infoboxcivilconflict") ||
infoboxLow.contains("{{infoboxmilitaryconflict")) {
String dateline = TimeSpan1MonthInfoboxExtractor.extractDateLine(infoBox);
String dateline = TimeSpan1MonthInfoboxExtractor.extractDateLine(rawResult.getInfobox());
// String dateString = attack.extractDateString(dateline);

if (!dateline.isEmpty()) {// && !dateString.isEmpty()) {
return dateline + " => " + title;
return dateline + " => " + rawResult.getTitle();
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/workers/ReadInfoBoxWorker.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ public ReadInfoBoxWorker(List<RawElasticResult> rawElasticResults, Map<String, S
@Override
public void run() {
for(RawElasticResult rawResult : this.rawElasticResults) {
String infoBox = filter.extractPageInfoBox(rawResult.getText(), true);
String infoBox = rawResult.getInfobox();
if(infoBox != null && !infoBox.isEmpty()) {
infoBox = toReadableString(infoBox);

Expand Down
13 changes: 6 additions & 7 deletions src/test/java/wec/TestTimeExtraction.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package wec;

import data.InfoboxConfiguration;
import data.RawElasticResult;
import org.apache.commons.io.IOUtils;
import org.junit.Assert;
import org.junit.Test;
Expand Down Expand Up @@ -42,17 +43,15 @@ public void testIsNotSpanSingleMonth() throws IOException {

@Test
public void testExtractDateFromInfobox() {
InfoboxFilter filter = new InfoboxFilter(new InfoboxConfiguration());
TimeSpan1MonthInfoboxExtractor extractor = new TimeSpan1MonthInfoboxExtractor(null, null);
final List<AbstractMap.SimpleEntry<String, String>> sportText = getTimeFullPages();
for(AbstractMap.SimpleEntry<String, String> text : sportText) {
final String infoBox = filter.extractPageInfoBox(text.getValue());
boolean spanSingleMonth = extractor.isSpanSingleMonth(infoBox);
Assert.assertFalse(text.getKey(), spanSingleMonth);
final List<RawElasticResult> sportText = getTimeFullPages();
for(RawElasticResult text : sportText) {
boolean spanSingleMonth = extractor.isSpanSingleMonth(text.getInfobox());
Assert.assertFalse(text.getTitle(), spanSingleMonth);
}
}

private List<AbstractMap.SimpleEntry<String, String>> getTimeFullPages() {
private List<RawElasticResult> getTimeFullPages() {
return TestUtils.getTextAndTitle("time/page_time_extract.json");
}
}
20 changes: 13 additions & 7 deletions src/test/java/wec/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import data.RawElasticResult;

import java.io.InputStream;
import java.io.InputStreamReader;
Expand All @@ -14,16 +15,18 @@
public class TestUtils {
private static final Gson GSON = new Gson();

public static List<AbstractMap.SimpleEntry<String, String>> getTextAndTitle(String fileName) {
public static List<RawElasticResult> getTextAndTitle(String fileName) {
InputStream inputStreamNlp = TestWECLinksExtractor.class.getClassLoader().getResourceAsStream(fileName);
assert inputStreamNlp != null;
JsonArray inputJsonNlp = GSON.fromJson(new InputStreamReader(inputStreamNlp), JsonArray.class);

List<AbstractMap.SimpleEntry<String, String>> retTexts = new ArrayList<>();
List<RawElasticResult> retTexts = new ArrayList<>();
for(JsonElement jsonObj : inputJsonNlp) {
AbstractMap.SimpleEntry<String, String> pair = new AbstractMap.SimpleEntry<>(jsonObj.getAsJsonObject().get("title").getAsString(),
jsonObj.getAsJsonObject().get("text").getAsString());
retTexts.add(pair);
String title = jsonObj.getAsJsonObject().get("title").getAsString();
String text = jsonObj.getAsJsonObject().get("text").getAsString();
String infobox = jsonObj.getAsJsonObject().get("infobox").getAsString();

retTexts.add(new RawElasticResult(title, text, infobox));
}

return retTexts;
Expand All @@ -43,9 +46,12 @@ public static List<JsonObject> getTextTitleAndExpected(String fileName) {
return retTexts;
}

public static String getText(String fileNme) {
public static RawElasticResult getText(String fileNme) {
InputStream inputStreamNlp = TestWECLinksExtractor.class.getClassLoader().getResourceAsStream(fileNme);
JsonObject inputJsonNlp = GSON.fromJson(new InputStreamReader(inputStreamNlp), JsonObject.class);
return inputJsonNlp.get("text").getAsString();
String text = inputJsonNlp.get("text").getAsString();
String title = inputJsonNlp.get("title").getAsString();
String infobox = inputJsonNlp.get("infobox").getAsString();
return new RawElasticResult(title, text, infobox);
}
}
Loading

0 comments on commit efb0525

Please sign in to comment.