Skip to content

Commit

Permalink
Fix issue with UTF-8 BOM when charset only in HTML.
Browse files Browse the repository at this point in the history
Fixes #348
  • Loading branch information
jhy committed Nov 18, 2013
1 parent 6c4f16f commit 3f9f33d
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 7 deletions.
5 changes: 5 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ jsoup changelog
* Fixed an issue where <svg><img/></svg> was parsed as <svg><image/></svg>
<https://github.com/jhy/jsoup/issues/364>

* Fixed an issue where a UTF-8 BOM character was not detected if the HTTP response did not specify a charset but
the HTML body did, leading to the head contents incorrectly being parsed into the body. Changed the behavior so that
when the UTF-8 BOM is detected, it takes precedence for determining the charset to decode with.
<https://github.com/jhy/jsoup/issues/348>

*** Release 1.7.3 [2013-Nov-10]
* Introduced FormElement, providing easy access to form controls and their data, and the ability to submit forms
with Jsoup.Connect.
Expand Down
16 changes: 9 additions & 7 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public static Document load(InputStream in, String charsetName, String baseUri,

// reads bytes first into a buffer, then decodes with the appropriate charset. done this way to support
// switching the charset midstream when a meta http-equiv tag defines the charset.
// todo - this is getting gnarly. needs a rewrite.
static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
String docData;
Document doc = null;
Expand All @@ -81,7 +82,6 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
doc = parser.parseInput(docData, baseUri);
Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
if (meta != null) { // if not found, will keep utf-8 as best attempt

String foundCharset;
if (meta.hasAttr("http-equiv")) {
foundCharset = getCharsetFromContentType(meta.attr("content"));
Expand Down Expand Up @@ -110,13 +110,15 @@ static Document parseByteData(ByteBuffer byteData, String charsetName, String ba
Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
docData = Charset.forName(charsetName).decode(byteData).toString();
}
// UTF-8 BOM indicator. takes precedence over everything else. rarely used. re-decodes in case the above decoded incorrectly
if (docData.length() > 0 && docData.charAt(0) == 65279) {
byteData.rewind();
docData = Charset.forName(defaultCharset).decode(byteData).toString();
docData = docData.substring(1);
charsetName = defaultCharset;
doc = null;
}
if (doc == null) {
// there are times where there is a spurious byte-order-mark at the start of the text. Shouldn't be present
// in utf-8. If after decoding, there is a BOM, strip it; otherwise will cause the parser to go straight
// into head mode
if (docData.length() > 0 && docData.charAt(0) == 65279)
docData = docData.substring(1);

doc = parser.parseInput(docData, baseUri);
doc.outputSettings().charset(charsetName);
}
Expand Down
8 changes: 8 additions & 0 deletions src/test/java/org/jsoup/helper/DataUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@ public void testCharset() {
assertEquals("One", doc.head().text());
}

/**
 * Input that starts with a byte-order mark and is parsed with a null charset
 * should still place the title in the head and report UTF-8 as the output charset.
 */
@Test
public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() {
    // U+FEFF prepended to otherwise ordinary markup
    final String markup = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
    final Charset utf8 = Charset.forName("UTF-8");
    final ByteBuffer encoded = utf8.encode(markup);

    // a null charset name asks parseByteData to detect the charset itself
    final Document parsed = DataUtil.parseByteData(encoded, null, "http://foo.com/", Parser.htmlParser());

    final String headText = parsed.head().text();
    assertEquals("One", headText);

    final String outputCharset = parsed.outputSettings().charset().displayName();
    assertEquals("UTF-8", outputCharset);
}

@Test
public void shouldNotThrowExceptionOnEmptyCharset() {
assertEquals(null, DataUtil.getCharsetFromContentType("text/html; charset="));
Expand Down

0 comments on commit 3f9f33d

Please sign in to comment.