Skip to content

Commit

Permalink
Merge pull request #151 from cmacdonald/PR-ContentType
Browse files Browse the repository at this point in the history
Don't throw NPE if Content-Type is missing
  • Loading branch information
yasserg authored Aug 10, 2016
2 parents 4a89f0a + 4bff407 commit 096819e
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 4 deletions.
3 changes: 2 additions & 1 deletion src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,8 @@ private void processPage(WebURL curURL) {
String description = EnglishReasonPhraseCatalog.INSTANCE
.getReason(fetchResult.getStatusCode(), Locale.ENGLISH); // Finds the status reason for all known statuses
String contentType =
fetchResult.getEntity() == null ? "" : fetchResult.getEntity().getContentType().getValue();
fetchResult.getEntity() == null ? "" :
fetchResult.getEntity().getContentType() == null ? "" : fetchResult.getEntity().getContentType().getValue();
onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,20 +101,21 @@ private HostDirectives fetchDirectives(URL url) {
if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
Page page = new Page(robotsTxtUrl);
fetchResult.fetchContent(page);
if (Util.hasPlainTextContent(page.getContentType())) {
String contentType = page.getContentType() == null ? "" : page.getContentType();
if (Util.hasPlainTextContent(contentType) || contentType.length() == 0) {
String content;
if (page.getContentCharset() == null) {
content = new String(page.getContentData());
} else {
content = new String(page.getContentData(), page.getContentCharset());
}
directives = RobotstxtParser.parse(content, config.getUserAgentName());
} else if (page.getContentType().contains("html")) { // TODO This one should be upgraded to remove all html tags
} else if (contentType.contains("html")) { // TODO This one should be upgraded to remove all html tags
String content = new String(page.getContentData());
directives = RobotstxtParser.parse(content, config.getUserAgentName());
} else {
logger.warn("Can't read this robots.txt: {} as it is not written in plain text, contentType: {}",
robotsTxtUrl.getURL(), page.getContentType());
robotsTxtUrl.getURL(), contentType);
}
} else {
logger.debug("Can't read this robots.txt: {} as it's status code is {}", robotsTxtUrl.getURL(),
Expand Down

0 comments on commit 096819e

Please sign in to comment.