Skip to content

Commit

Permalink
Added configuration parameter processBinaryContentInCrawling that
Browse files Browse the repository at this point in the history
determines if binary content must be processed by TIKA in addition to
being retrieved at all (which is controlled by
includeBinaryContentInCrawling). This is useful if you want to be able
to retrieve the binary content but do not care if links inside are
processed. This can strongly improve performance when handling binary
documents.
  • Loading branch information
EgbertW committed May 20, 2015
1 parent b5826dc commit 9a6de42
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
16 changes: 16 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ public class CrawlConfig {
* Should we fetch binary content such as images, audio, ...?
*/
private boolean includeBinaryContentInCrawling = false;

/**
* Should we process binary content such as images, audio, ... using TIKA?
*/
private boolean processBinaryContentInCrawling = false;

/**
* Maximum Connections per host
Expand Down Expand Up @@ -294,6 +299,17 @@ public boolean isIncludeBinaryContentInCrawling() {
public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling) {
this.includeBinaryContentInCrawling = includeBinaryContentInCrawling;
}

/**
* Returns whether binary content such as images, audio, ... should be
* processed using TIKA.
*
* @return the current value of the processBinaryContentInCrawling setting
*/
public boolean isProcessBinaryContentInCrawling() {
return processBinaryContentInCrawling;
}

/**
* Sets whether binary content such as images, audio, ... should be processed
* using TIKA. This is a separate setting from includeBinaryContentInCrawling,
* which decides whether binary content is fetched in the first place.
*
* @param processBinaryContentInCrawling the new value of the setting
*/
public void setProcessBinaryContentInCrawling(boolean processBinaryContentInCrawling) {
this.processBinaryContentInCrawling = processBinaryContentInCrawling;
}

public int getMaxConnectionsPerHost() {
return maxConnectionsPerHost;
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,11 @@ public void parse(Page page, String contextURL) throws NotAllowedContentExceptio
if (Util.hasBinaryContent(page.getContentType())) { // BINARY
BinaryParseData parseData = new BinaryParseData();
if (config.isIncludeBinaryContentInCrawling()) {
parseData.setBinaryContent(page.getContentData());
if (config.isProcessBinaryContentInCrawling()) {
parseData.setBinaryContent(page.getContentData());
} else {
parseData.setHtml("<html></html>");
}
page.setParseData(parseData);
if (parseData.getHtml() == null) {
throw new ParseException();
Expand Down

0 comments on commit 9a6de42

Please sign in to comment.