Skip to content

Commit

Permalink
commit before refactoring
Browse files Browse the repository at this point in the history
  • Loading branch information
Alon Eirew committed Feb 23, 2021
1 parent 02dfa99 commit 04b212b
Show file tree
Hide file tree
Showing 8 changed files with 429 additions and 38 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ src/test/java/wec/TestExtractCompany.java
src/main/java/data/CompanyObj.java
infobox_config/company_info.json
src/test/resources/wiki_links/company.json

EnWikiLinksAllEvents_v10.db
1 change: 1 addition & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies {
compile group: 'edu.stanford.nlp', name: 'stanford-corenlp', classifier: 'models', version: '3.9.1'
compile group: 'org.apache.commons', name: 'commons-dbcp2', version: '2.6.0'
compile group: 'org.apache.commons', name: 'commons-text', version: '1.6'
compile group: 'commons-codec', name: 'commons-codec', version: '1.15'
compile group: 'org.xerial', name: 'sqlite-jdbc', version: '3.27.2.1'
compile group: 'info.bliki.wiki', name: 'bliki-core', version: '3.1.0'
compile group: 'org.jsoup', name: 'jsoup', version: '1.13.1'
Expand Down
8 changes: 7 additions & 1 deletion src/main/java/data/BaseMention.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ public BaseMention(long mentionId) {
this.mentionId = mentionId;
}

/**
 * Copy constructor: creates a mention carrying the same id, coref id, token span,
 * source page and context as {@code mention}.
 *
 * <p>The values are forwarded unchanged to the six-argument constructor; the
 * {@code context} reference itself is passed on, not a copy made here.
 *
 * @param mention the mention to copy; dereferenced directly, so it must not be {@code null}
 */
public BaseMention(BaseMention mention) {
this(mention.mentionId, mention.corefId,
mention.tokenStart, mention.tokenEnd,
mention.extractedFromPage, mention.context);
}

public BaseMention(long mentionId, int corefId, int tokenStart, int tokenEnd, String extractedFromPage, JsonArray context) {
this.mentionId = mentionId;
this.corefId = corefId;
Expand Down Expand Up @@ -73,7 +79,7 @@ public void setContext(JsonArray context) {
this.context = context;
}

protected String getContextAsSQLBlob() {
/**
 * Serializes this mention's context with the class-level {@code GSON} instance.
 *
 * @return the string produced by {@code GSON.toJson} for the current context
 */
public String getContextAsJsonString() {
return GSON.toJson(this.context);
}

Expand Down
13 changes: 11 additions & 2 deletions src/main/java/data/WECMention.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,15 @@ public WECMention() {
super(runningId.incrementAndGet());
}

/**
 * Copy constructor: duplicates {@code mention}, reusing its coreference chain and
 * mention text.
 *
 * <p>The base-class state is copied by {@code super(mention)}. When the source has a
 * coreference chain, that chain's mention counter is incremented once for the copy.
 *
 * @param mention the mention to copy; dereferenced directly, so it must not be {@code null}
 */
public WECMention(WECMention mention) {
    super(mention);
    this.mentionText = mention.mentionText;
    this.coreChain = mention.coreChain;
    if (mention.coreChain == null) {
        return;
    }
    // The copy is one more mention attached to the same chain.
    mention.coreChain.incMentionsCount();
}

public WECMention(WECCoref coref, String mentionText,
int tokenStart, int tokenEnd, String extractedFromPage, JsonArray context) {
super(runningId.incrementAndGet(), coref.getCorefId(), tokenStart, tokenEnd, extractedFromPage, context);
Expand Down Expand Up @@ -81,7 +90,7 @@ public String getValues() {
getTokenStart() + "," +
getTokenEnd() + "," +
"'" + getExtractedFromPage() + "'" + "," +
"'" + getContextAsSQLBlob() + "'" +
"'" + getContextAsJsonString() + "'" +
"'" + String.join(", ", this.mentionTokensPos) + "'";
}

Expand All @@ -98,7 +107,7 @@ public void setPrepareInsertStatementValues(PreparedStatement preparedStatement)
preparedStatement.setInt(4, this.getTokenStart());
preparedStatement.setInt(5, this.getTokenEnd());
preparedStatement.setString(6, this.getExtractedFromPage());
preparedStatement.setString(7, getContextAsSQLBlob());
preparedStatement.setString(7, getContextAsJsonString());
preparedStatement.setString(8, String.join(", ", this.mentionTokensPos));
}

Expand Down
16 changes: 16 additions & 0 deletions src/main/java/data/WECMentionSubEvent.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package data;

import java.util.Set;

/**
 * A {@link WECMention} paired with a set of integer ids it is a sub-event of.
 *
 * <p>The set handed to the constructor is stored and returned as-is (no copy is made).
 */
public class WECMentionSubEvent extends WECMention {

    /** Ids this mention is a sub-event of; presumably coref ids — confirm against the producer. */
    private final Set<Integer> subEventOf;

    /**
     * Builds a sub-event mention by copying {@code mention} through the
     * {@link WECMention} copy constructor.
     *
     * @param mention    the mention to copy
     * @param subEventOf the ids this mention is a sub-event of; kept by reference
     */
    public WECMentionSubEvent(WECMention mention, Set<Integer> subEventOf) {
        super(mention);
        this.subEventOf = subEventOf;
    }

    /**
     * @return the set supplied at construction time (same instance)
     */
    public Set<Integer> getSubEventOf() {
        return this.subEventOf;
    }
}
173 changes: 168 additions & 5 deletions src/main/java/experimentscripts/event/GenMoreEventsStats.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,186 @@

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonArray;
import com.google.gson.reflect.TypeToken;
import data.BaseMention;
import data.EventSubEventPair;
import data.WECCoref;
import data.WECMentionSubEvent;
import persistence.SQLQueryApi;
import persistence.SQLiteConnections;

import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.lang.reflect.Type;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

public class GenMoreEventsStats {
/**
 * Shared Gson instance: pretty-prints and leaves HTML characters unescaped.
 * Declared exactly once; a second declaration of the same field does not compile.
 */
private static final Gson GSON = new GsonBuilder().setPrettyPrinting().disableHtmlEscaping().create();

/** JSON list of event/sub-event pairs, read by {@code extractMorePairsStats}. */
private static final String INPUT_JSON_PAIRS = "output/sub_events_sharing_context.json";
/** JSON map from context key to the mentions found in that context. */
private static final String INPUT_JSON_CONTEXTS = "output/context_mentions_file.json";
/** JSON map from context key to the context token array. */
private static final String INPUT_JSON_CONTEXTS_HASH = "output/hashcode_to_context.json";

// private static final String SQL_URL = "jdbc:sqlite:/Users/aeirew/workspace/DataBase/EnWikiLinksAllEvents_v10.db";
/** JDBC URL of the SQLite database, resolved relative to the working directory. */
private static final String SQL_URL = "jdbc:sqlite:EnWikiLinksAllEvents_v10.db";
/** Maximum number of example paragraphs printed per coref type. */
private static final int MAX_EXAMPLES = 5;

/** Older name for the pairs file (same path as {@link #INPUT_JSON_PAIRS}); kept for existing references. */
private static final String INPUT_JSON = "output/sub_events_sharing_context.json";

/**
 * Entry point: runs the statistics routine that is currently enabled.
 *
 * <p>The other routines are left as commented-out toggles; enable one by
 * uncommenting its call. No input file is opened here: each routine loads what it
 * needs, so a missing pairs file no longer aborts a run that never uses it.
 *
 * @param args ignored
 * @throws Exception whatever the enabled routine throws
 */
public static void main(String[] args) throws Exception {
    // extractMorePairsStats();
    // extractContextStats();
    // genWithManyMentionsInContextExamples();
    // genOfEachSubTypeExample();
    generateCorefMentionsDist();
    System.out.println();
}

/**
 * Prints occurrence statistics for the event/sub-event pairs stored in
 * {@code INPUT_JSON_PAIRS}.
 *
 * <p>Three things are written to standard output: the number of distinct pairs, a
 * map from pair id ({@code <eventCorefId>_<subEventCorefId>}) to how many times the
 * pair occurs, and a histogram mapping an occurrence count to the number of pairs
 * having that count.
 *
 * @throws IOException if the input file cannot be opened, read or closed
 */
private static void extractMorePairsStats() throws IOException {
    Type listType = new TypeToken<ArrayList<EventSubEventPair>>(){}.getType();
    List<EventSubEventPair> eventPairs;
    // Close the reader deterministically and read as UTF-8, like generateCorefMentionsDist.
    try (FileReader reader = new FileReader(INPUT_JSON_PAIRS, StandardCharsets.UTF_8)) {
        eventPairs = GSON.fromJson(reader, listType);
    }

    // pair id -> number of occurrences
    Map<String, AtomicInteger> pairsDist = new HashMap<>();
    for (EventSubEventPair pair : eventPairs) {
        String pairId = pair.getEvent().getCorefId() + "_" + pair.getSubEvent().getCorefId();
        pairsDist.computeIfAbsent(pairId, key -> new AtomicInteger(0)).incrementAndGet();
    }

    // occurrence count -> number of pairs seen exactly that many times
    Map<Integer, AtomicInteger> timesAppear = new HashMap<>();
    for (AtomicInteger occurrences : pairsDist.values()) {
        timesAppear.computeIfAbsent(occurrences.get(), key -> new AtomicInteger(0)).incrementAndGet();
    }

    System.out.println(pairsDist.size());
    System.out.println(GSON.toJson(pairsDist));
    System.out.println(GSON.toJson(timesAppear));
}

/**
 * Prints three distributions computed over the contexts in {@code INPUT_JSON_CONTEXTS}:
 * number of mentions per context, mentions per coref type, and mentions per coref
 * sub-type.
 *
 * <p>The type and sub-type of a whole context are taken from the coref of its
 * <em>first</em> mention only, and the context's full mention count is added to
 * that type's and sub-type's totals.
 *
 * @throws IOException if the input file cannot be opened, read or closed
 */
private static void extractContextStats() throws IOException {
    Type mapType = new TypeToken<Map<String, List<WECMentionSubEvent>>>(){}.getType();
    Map<String, List<WECMentionSubEvent>> eventsInContext;
    // Close the reader deterministically and read as UTF-8, like generateCorefMentionsDist.
    try (FileReader reader = new FileReader(INPUT_JSON_CONTEXTS, StandardCharsets.UTF_8)) {
        eventsInContext = GSON.fromJson(reader, mapType);
    }

    SQLQueryApi queryApi = new SQLQueryApi(new SQLiteConnections(SQL_URL));

    Map<Integer, AtomicInteger> contextDistributions = new HashMap<>();
    Map<String, AtomicInteger> mentionsTypeDistributions = new HashMap<>();
    Map<String, AtomicInteger> mentionsSubTypeDistributions = new HashMap<>();
    for (List<WECMentionSubEvent> contextEvents : eventsInContext.values()) {
        int mentionsInContext = contextEvents.size();
        contextDistributions.computeIfAbsent(mentionsInContext, key -> new AtomicInteger())
                .incrementAndGet();

        if (contextEvents.isEmpty()) {
            // No first mention to resolve a type from; the size-0 bucket above still counts it.
            continue;
        }

        // NOTE(review): assumes getCorefById always finds the coref — confirm it cannot return null.
        WECCoref corefById = queryApi.getCorefById(contextEvents.get(0).getCorefId());
        mentionsTypeDistributions.computeIfAbsent(corefById.getCorefType(), key -> new AtomicInteger())
                .addAndGet(mentionsInContext);
        mentionsSubTypeDistributions.computeIfAbsent(corefById.getCorefSubType(), key -> new AtomicInteger())
                .addAndGet(mentionsInContext);
    }

    System.out.println("Context Distribution:");
    System.out.println(GSON.toJson(contextDistributions));
    System.out.println("Mentions Type Distribution:");
    System.out.println(GSON.toJson(mentionsTypeDistributions));
    System.out.println("Mentions Sub-Type Distribution:");
    System.out.println(GSON.toJson(mentionsSubTypeDistributions));
}

/**
 * Prints one example paragraph for each distinct even mention count found among
 * the contexts in {@code INPUT_JSON_CONTEXTS}.
 *
 * <p>For the first context encountered with a given even size, the context tokens
 * are looked up by context key in {@code INPUT_JSON_CONTEXTS_HASH}, attached to the
 * context's first mention, and the paragraph is rendered through
 * {@code ReadFilteredJsonAndProcess.printSingleParagraph}. Empty contexts are
 * skipped: zero is even, but there is no first mention to attach the context to.
 *
 * @throws IOException if an input file cannot be opened, read or closed
 */
private static void genWithManyMentionsInContextExamples() throws IOException {
    Type mentionsType = new TypeToken<Map<String, List<WECMentionSubEvent>>>(){}.getType();
    Map<String, List<WECMentionSubEvent>> eventPairs;
    // Close each reader deterministically and read as UTF-8, like generateCorefMentionsDist.
    try (FileReader reader = new FileReader(INPUT_JSON_CONTEXTS, StandardCharsets.UTF_8)) {
        eventPairs = GSON.fromJson(reader, mentionsType);
    }

    Type contextType = new TypeToken<Map<String, JsonArray>>(){}.getType();
    Map<String, JsonArray> contextToHash;
    try (FileReader reader = new FileReader(INPUT_JSON_CONTEXTS_HASH, StandardCharsets.UTF_8)) {
        contextToHash = GSON.fromJson(reader, contextType);
    }

    // Context sizes for which an example has already been printed.
    Map<Integer, Boolean> distributions = new HashMap<>();
    for (Map.Entry<String, List<WECMentionSubEvent>> entry : eventPairs.entrySet()) {
        String contextKey = entry.getKey();
        List<WECMentionSubEvent> contextEvents = entry.getValue();
        int mentionsInContext = contextEvents.size();

        boolean evenAndNonEmpty = mentionsInContext > 0 && mentionsInContext % 2 == 0;
        if (!evenAndNonEmpty || distributions.containsKey(mentionsInContext)) {
            continue;
        }

        System.out.println("Found context " + contextKey + " with " + mentionsInContext + " mentions");
        distributions.put(mentionsInContext, Boolean.TRUE);
        // The mentions file does not carry the context tokens; restore them on the first mention.
        contextEvents.get(0).setContext(contextToHash.get(contextKey));

        System.out.println(String.join(" ", ReadFilteredJsonAndProcess.printSingleParagraph(contextEvents)));
        System.out.println();
    }
}

/**
 * Prints up to {@code MAX_EXAMPLES} example paragraphs per coref type.
 *
 * <p>The type of a context is the coref type of its first mention, resolved through
 * the SQLite database at {@code SQL_URL}. Context tokens are looked up by context
 * key in {@code INPUT_JSON_CONTEXTS_HASH} and attached to the first mention before
 * rendering. Empty contexts are skipped because they have no first mention.
 *
 * <p>NOTE(review): despite the method name, examples are grouped by
 * {@code getCorefType()}, not {@code getCorefSubType()} — confirm which is intended.
 *
 * @throws IOException if an input file cannot be opened, read or closed
 */
private static void genOfEachSubTypeExample() throws IOException {
    SQLQueryApi queryApi = new SQLQueryApi(new SQLiteConnections(SQL_URL));

    Type mentionsType = new TypeToken<Map<String, List<WECMentionSubEvent>>>(){}.getType();
    Map<String, List<WECMentionSubEvent>> eventPairs;
    // Close each reader deterministically and read as UTF-8, like generateCorefMentionsDist.
    try (FileReader reader = new FileReader(INPUT_JSON_CONTEXTS, StandardCharsets.UTF_8)) {
        eventPairs = GSON.fromJson(reader, mentionsType);
    }

    Type contextType = new TypeToken<Map<String, JsonArray>>(){}.getType();
    Map<String, JsonArray> contextToHash;
    try (FileReader reader = new FileReader(INPUT_JSON_CONTEXTS_HASH, StandardCharsets.UTF_8)) {
        contextToHash = GSON.fromJson(reader, contextType);
    }

    // coref type -> 1-based index of the next example to print for that type
    Map<String, AtomicInteger> distributions = new HashMap<>();
    for (Map.Entry<String, List<WECMentionSubEvent>> entry : eventPairs.entrySet()) {
        List<WECMentionSubEvent> contextEvents = entry.getValue();
        if (contextEvents.isEmpty()) {
            continue;
        }

        // NOTE(review): assumes getCorefById always finds the coref — confirm it cannot return null.
        WECCoref corefById = queryApi.getCorefById(contextEvents.get(0).getCorefId());
        AtomicInteger nextExample =
                distributions.computeIfAbsent(corefById.getCorefType(), key -> new AtomicInteger(1));

        if (nextExample.get() <= MAX_EXAMPLES) {
            System.out.println("Found context " + corefById.getCorefType());
            contextEvents.get(0).setContext(contextToHash.get(entry.getKey()));
            System.out.println(String.join(" ", ReadFilteredJsonAndProcess.printSingleParagraph(contextEvents)));
            System.out.println();
            nextExample.incrementAndGet();
        }
    }
}

/**
 * Counts how many mentions each coref id has across all contexts in
 * {@code INPUT_JSON_CONTEXTS} and writes the {@code corefId -> count} map as JSON to
 * {@code output/generated_stats.json}, then prints {@code Done!}.
 *
 * <p>A title-keyed variant of the same distribution is also built (title = text of
 * the first mention seen for the coref, with {@code ':'} replaced by {@code ';'}).
 *
 * @throws IOException if the input cannot be read or the output cannot be written
 */
private static void generateCorefMentionsDist() throws IOException {
    Type mapType = new TypeToken<Map<String, List<WECMentionSubEvent>>>(){}.getType();
    Map<String, List<WECMentionSubEvent>> eventsInContext;
    // try-with-resources: the reader is closed even if parsing fails.
    try (FileReader reader = new FileReader(INPUT_JSON_CONTEXTS, StandardCharsets.UTF_8)) {
        eventsInContext = GSON.fromJson(reader, mapType);
    }

    Map<Integer, AtomicInteger> corefMentionsDist = new HashMap<>();
    Map<Integer, String> corefIdToTitle = new HashMap<>();
    for (List<WECMentionSubEvent> contextEvents : eventsInContext.values()) {
        for (WECMentionSubEvent mentionSubEvent : contextEvents) {
            int corefId = mentionSubEvent.getCorefId();
            // The title is only computed for the first mention of each coref; plain
            // replace() is enough because ':' is a literal, not a pattern.
            corefIdToTitle.computeIfAbsent(corefId,
                    key -> mentionSubEvent.getMentionText().replace(":", ";"));
            corefMentionsDist.computeIfAbsent(corefId, key -> new AtomicInteger()).incrementAndGet();
        }
    }

    // NOTE(review): finalDist is built but never written or printed — the file below
    // receives the id-keyed map. Corefs sharing a title also collapse into one entry
    // here. Confirm which map is the intended output before relying on this file.
    Map<String, Integer> finalDist = new HashMap<>();
    for (Map.Entry<Integer, AtomicInteger> entry : corefMentionsDist.entrySet()) {
        finalDist.put(corefIdToTitle.get(entry.getKey()), entry.getValue().get());
    }

    // System.out.println("Coref Mentions Distribution:");
    // System.out.println(GSON.toJson(finalDist));

    // try-with-resources flushes and closes the writer even if serialization throws.
    try (FileWriter writer1 = new FileWriter("output/generated_stats.json", StandardCharsets.UTF_8)) {
        GSON.toJson(corefMentionsDist, writer1);
    }
    System.out.println("Done!");
}
}
Loading

0 comments on commit 04b212b

Please sign in to comment.