Merge pull request #151 from cmacdonald/PR-ContentType

Don't throw NPE if Content-Type is missing
yasserg · Aug 10, 2016 · 096819e · 096819e
2 parents 4a89f0a + 4bff407
commit 096819e
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 4 deletions.
diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -369,7 +369,8 @@ private void processPage(WebURL curURL) {
           String description = EnglishReasonPhraseCatalog.INSTANCE
               .getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses
           String contentType =
-              fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue();
+              fetchResult.getEntity() == null ? "" : 
+            	  fetchResult.getEntity().getContentType() == null ? "" : fetchResult.getEntity().getContentType().getValue();
           onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
         }
 

diff --git a/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.java b/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.java
@@ -101,20 +101,21 @@ private HostDirectives fetchDirectives(URL url) {
       if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
         Page page = new Page(robotsTxtUrl);
         fetchResult.fetchContent(page);
-        if (Util.hasPlainTextContent(page.getContentType())) {
+        String contentType = page.getContentType() == null ? "" : page.getContentType();
+        if (Util.hasPlainTextContent(contentType) || contentType.length() == 0) {
           String content;
           if (page.getContentCharset() == null) {
             content = new String(page.getContentData());
           } else {
             content = new String(page.getContentData(), page.getContentCharset());
           }
           directives = RobotstxtParser.parse(content, config.getUserAgentName());
-        } else if (page.getContentType().contains("html")) { // TODO This one should be upgraded to remove all html tags
+        } else if (contentType.contains("html")) { // TODO This one should be upgraded to remove all html tags
           String content = new String(page.getContentData());
           directives = RobotstxtParser.parse(content, config.getUserAgentName());
         } else {
           logger.warn("Can't read this robots.txt: {}  as it is not written in plain text, contentType: {}",
-                      robotsTxtUrl.getURL(), page.getContentType());
+                      robotsTxtUrl.getURL(), contentType);
         }
       } else {
         logger.debug("Can't read this robots.txt: {}  as it's status code is {}", robotsTxtUrl.getURL(),