Added configuration parameter processBinaryContentInCrawling that

determines whether binary content is also processed by TIKA, in addition to being retrieved (retrieval itself is controlled by includeBinaryContentInCrawling). This is useful if you want to retrieve the binary content but do not care whether the links inside it are processed. Skipping that processing can strongly improve performance when handling binary documents.
yasserg · May 20, 2015 · 9a6de42 · 9a6de42
1 parent b5826dc
commit 9a6de42
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 1 deletion.
diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
@@ -79,6 +79,11 @@ public class CrawlConfig {
    * Should we fetch binary content such as images, audio, ...?
    */
   private boolean includeBinaryContentInCrawling = false;
+
+  /**
+   * Should we process binary content such as image, audio, ... using TIKA?
+   */
+  private boolean processBinaryContentInCrawling = false;
 
   /**
    * Maximum Connections per host
@@ -294,6 +299,17 @@ public boolean isIncludeBinaryContentInCrawling() {
   public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling) {
     this.includeBinaryContentInCrawling = includeBinaryContentInCrawling;
   }
+
+  public boolean isProcessBinaryContentInCrawling() {
+    return processBinaryContentInCrawling;
+  }
+
+  /**
+   * Should we process binary content such as images, audio, ... using TIKA?
+   */
+  public void setProcessBinaryContentInCrawling(boolean processBinaryContentInCrawling) {
+    this.processBinaryContentInCrawling = processBinaryContentInCrawling;
+  }  
 
   public int getMaxConnectionsPerHost() {
     return maxConnectionsPerHost;

diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
@@ -60,7 +60,11 @@ public void parse(Page page, String contextURL) throws NotAllowedContentExceptio
     if (Util.hasBinaryContent(page.getContentType())) { // BINARY
       BinaryParseData parseData = new BinaryParseData();
       if (config.isIncludeBinaryContentInCrawling()) {
-        parseData.setBinaryContent(page.getContentData());
+        if (config.isProcessBinaryContentInCrawling()) {
+          parseData.setBinaryContent(page.getContentData());
+        } else {
+          parseData.setHtml("<html></html>");
+        }
         page.setParseData(parseData);
         if (parseData.getHtml() == null) {
           throw new ParseException();