Skip to content

Commit

Permalink
experiment
Browse files Browse the repository at this point in the history
  • Loading branch information
Alon Eirew committed Aug 22, 2020
1 parent e80d52b commit 06574c2
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,7 @@ src/main/java/workers/ReadCompanyFactory.java
src/main/java/workers/ReadCompanyWorker.java
src/main/java/experimentscripts/ExtractCompanyMain.java
src/main/java/wec/extractors/ExtractCompany.java
src/test/java/wec/TestExtractCompany.java
src/main/java/data/CompanyObj.java
infobox_config/company_info.json
src/test/resources/wiki_links/company.json
10 changes: 9 additions & 1 deletion src/main/java/utils/WikipediaUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,21 @@
/**
 * Static helpers for cleaning raw Wikipedia text fields.
 */
public class WikipediaUtils {

    // Both patterns use (?s) (DOTALL) so '.' also matches line terminators, and a greedy
    // '.*', so a single match spans from the FIRST opener to the LAST closer in the text.
    // NOTE(review): this also drops any plain text sitting between two separate
    // templates/links; confirm that is the intended behavior of this experiment.

    /** Matches a double-curly-brace region: from the first "{{" to the last "}}". */
    private static final Pattern CURLY_REGION = Pattern.compile("(?s)\\{\\{.*}}");

    /** Matches a square-bracket region: from the first "[" to the last "]". */
    private static final Pattern BRACKET_REGION = Pattern.compile("(?s)\\[.*]");

    /**
     * Removes double-curly-brace regions and then square-bracket regions from the text.
     *
     * <p>Curly-brace regions are stripped first; square-bracket regions are stripped from
     * the result. Unbalanced openers or closers (no matching counterpart) are left as is.
     *
     * @param html the raw text to clean; may be {@code null} or empty
     * @return the text with the matched regions removed, or the input itself when it is
     *         {@code null} or empty
     */
    public static String cleanTextField(String html) {
        if (html == null || html.isEmpty()) {
            // Nothing to strip; avoids a NullPointerException from Pattern.matcher(null).
            return html;
        }

        String withoutCurly = stripUntilStable(CURLY_REGION, html);
        return stripUntilStable(BRACKET_REGION, withoutCurly);
    }

    /** Repeatedly deletes every match of {@code pattern} until the text stops changing. */
    private static String stripUntilStable(Pattern pattern, String text) {
        String current = text;
        String previous;
        do {
            previous = current;
            current = pattern.matcher(previous).replaceAll("");
        } while (!current.equals(previous));
        return current;
    }
}

0 comments on commit 06574c2

Please sign in to comment.