Skip to content

Commit

Permalink
Added support for loading gzipped HTML files.
Browse files Browse the repository at this point in the history
Mostly this is so I can compress the test HTML files in the repo,
so that GitHub stops reporting jsoup as a HTML project.
  • Loading branch information
jhy committed Mar 1, 2020
1 parent 5136929 commit 34715b3
Show file tree
Hide file tree
Showing 31 changed files with 84 additions and 10,574 deletions.
3 changes: 3 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
jsoup changelog

*** Release 1.13.2 [PENDING]
* Improvement: added support for loading and parsing gzipped HTML files in Jsoup.parse(File in, charset, baseUri).

*** Release 1.13.1 [2020-Feb-29]
* Improvement: added Element#closest(selector), which walks up the tree to find the nearest element matching the
selector.
Expand Down
19 changes: 16 additions & 3 deletions src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import org.jsoup.UncheckedIOException;
import org.jsoup.internal.ConstrainableInputStream;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
Expand All @@ -28,6 +29,7 @@
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

/**
* Internal static utilities for handling data.
Expand All @@ -45,15 +47,26 @@ public final class DataUtil {
private DataUtil() {}

/**
 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
 * are supported in addition to uncompressed files. A file with one of those suffixes that does not start with the
 * gzip magic bytes is read as a plain, uncompressed file.
 *
 * @param in file to load
 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
 * the file will always override this setting.
 * @param baseUri base URI of document, to resolve relative links against
 * @return Document
 * @throws IOException on IO error
 */
public static Document load(File in, String charsetName, String baseUri) throws IOException {
    String name = Normalizer.lowerCase(in.getName());
    boolean zipped = false;
    if (name.endsWith(".gz") || name.endsWith(".z")) {
        // File input streams don't support marks, so sniff the header on a throwaway stream and reopen afterwards.
        // try-with-resources guarantees the probe is closed even when read() throws.
        try (InputStream probe = new FileInputStream(in)) {
            zipped = (probe.read() == 0x1f && probe.read() == 0x8b); // gzip magic bytes
        }
    }

    InputStream stream = new FileInputStream(in);
    if (zipped) {
        try {
            stream = new GZIPInputStream(stream);
        } catch (IOException | RuntimeException e) {
            // the GZIPInputStream constructor reads the header and may fail; don't leak the underlying file handle
            stream.close();
            throw e;
        }
    }
    return parseInputStream(stream, charsetName, baseUri, Parser.htmlParser());
}

/**
Expand Down
31 changes: 31 additions & 0 deletions src/test/java/org/jsoup/helper/DataUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,14 @@ public void supportsUTF8BOM() throws IOException {
assertEquals("OK", doc.head().select("title").text());
}

/**
 * Parses a gzipped resource that begins with a UTF-8 BOM, passing {@code null} as the charset, and checks
 * that both the title and the body text come through intact.
 */
@Test
public void supportsZippedUTF8BOM() throws IOException {
    File gzipped = getFile("/bomtests/bom_utf8.html.gz");
    Document parsed = Jsoup.parse(gzipped, null, "http://example.com");

    String title = parsed.head().select("title").text();
    assertEquals("OK", title);

    String bodyText = parsed.body().text();
    assertEquals("There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", bodyText);
}

@Test
public void supportsXmlCharsetDeclaration() throws IOException {
String encoding = "iso-8859-1";
Expand All @@ -177,4 +185,27 @@ public void supportsXmlCharsetDeclaration() throws IOException {
Document doc = Jsoup.parse(soup, null, "");
assertEquals("Hellö Wörld!", doc.body().text());
}


/**
 * Parses a gzip-compressed HTML resource with a {@code .gz} suffix and checks its title and first paragraph.
 */
// NOTE(review): the leading 'l' in the method name looks like a typo for loadsGzipFile; name left untouched here.
@Test
public void lLoadsGzipFile() throws IOException {
    File gzipped = getFile("/htmltests/gzip.html.gz");
    Document parsed = Jsoup.parse(gzipped, null);

    String title = parsed.title();
    assertEquals("Gzip test", title);

    String firstParagraph = parsed.selectFirst("p").text();
    assertEquals("This is a gzipped HTML file.", firstParagraph);
}

/**
 * Parses a gzip-compressed HTML resource that carries a {@code .z} suffix (compressed on Windows) and checks
 * its title and first paragraph.
 */
@Test
public void loadsZGzipFile() throws IOException {
    File zSuffixed = getFile("/htmltests/gzip.html.z");
    Document parsed = Jsoup.parse(zSuffixed, null);

    String title = parsed.title();
    assertEquals("Gzip test", title);

    String firstParagraph = parsed.selectFirst("p").text();
    assertEquals("This is a gzipped HTML file.", firstParagraph);
}

/**
 * Parses a resource that is named {@code .gz} but, per its expected title, is not actually gzipped, and checks
 * that it is still readable.
 */
@Test
public void handlesFakeGzipFile() throws IOException {
    File misnamed = getFile("/htmltests/fake-gzip.html.gz");
    Document parsed = Jsoup.parse(misnamed, null);

    String title = parsed.title();
    assertEquals("This is not gzipped", title);

    String firstParagraph = parsed.selectFirst("p").text();
    assertEquals("And should still be readable.", firstParagraph);
}
}
4 changes: 2 additions & 2 deletions src/test/java/org/jsoup/helper/W3CDomTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ public void simpleConversion() {

@Test
public void convertsGoogle() throws IOException {
File in = ParseTest.getFile("/htmltests/google-ipod.html");
File in = ParseTest.getFile("/htmltests/google-ipod.html.gz");
org.jsoup.nodes.Document doc = Jsoup.parse(in, "UTF8");

W3CDom w3c = new W3CDom();
Expand All @@ -108,7 +108,7 @@ public void convertsGoogle() throws IOException {

@Test
public void convertsGoogleLocation() throws IOException {
File in = ParseTest.getFile("/htmltests/google-ipod.html");
File in = ParseTest.getFile("/htmltests/google-ipod.html.gz");
org.jsoup.nodes.Document doc = Jsoup.parse(in, "UTF8");

W3CDom w3c = new W3CDom();
Expand Down
6 changes: 3 additions & 3 deletions src/test/java/org/jsoup/integration/ConnectTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ public void doesPut() throws IOException {
@Test
public void postFiles() throws IOException {
File thumb = ParseTest.getFile("/htmltests/thumb.jpg");
File html = ParseTest.getFile("/htmltests/google-ipod.html");
File html = ParseTest.getFile("/htmltests/google-ipod.html.gz");

Document res = Jsoup
.connect(EchoServlet.Url)
Expand All @@ -247,8 +247,8 @@ public void postFiles() throws IOException {

assertEquals("application/octet-stream", ihVal("Part secondPart ContentType", res));
assertEquals("secondPart", ihVal("Part secondPart Name", res));
assertEquals("google-ipod.html", ihVal("Part secondPart Filename", res));
assertEquals("43963", ihVal("Part secondPart Size", res));
assertEquals("google-ipod.html.gz", ihVal("Part secondPart Filename", res));
assertEquals("12212", ihVal("Part secondPart Size", res));

assertEquals("image/jpeg", ihVal("Part firstPart ContentType", res));
assertEquals("firstPart", ihVal("Part firstPart Name", res));
Expand Down
34 changes: 22 additions & 12 deletions src/test/java/org/jsoup/integration/ParseTest.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.jsoup.integration;

import org.jsoup.Jsoup;
import org.jsoup.helper.DataUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.ParseErrorList;
Expand All @@ -11,8 +12,10 @@
import java.io.*;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.zip.GZIPInputStream;

import static org.junit.Assert.*;

Expand All @@ -25,7 +28,7 @@ public class ParseTest {

@Test
public void testSmhBizArticle() throws IOException {
File in = getFile("/htmltests/smh-biz-article-1.html");
File in = getFile("/htmltests/smh-biz-article-1.html.gz");
Document doc = Jsoup.parse(in, "UTF-8",
"http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html");
assertEquals("The board’s next fear: the female quota",
Expand All @@ -40,7 +43,7 @@ public void testSmhBizArticle() throws IOException {

@Test
public void testNewsHomepage() throws IOException {
File in = getFile("/htmltests/news-com-au-home.html");
File in = getFile("/htmltests/news-com-au-home.html.gz");
Document doc = Jsoup.parse(in, "UTF-8", "http://www.news.com.au/");
assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", doc.title());
assertEquals("Brace yourself for Metro meltdown", doc.select(".id1225817868581 h4").text().trim());
Expand All @@ -58,7 +61,7 @@ public void testNewsHomepage() throws IOException {

@Test
public void testGoogleSearchIpod() throws IOException {
File in = getFile("/htmltests/google-ipod.html");
File in = getFile("/htmltests/google-ipod.html.gz");
Document doc = Jsoup.parse(in, "UTF-8", "http://www.google.com/search?hl=en&q=ipod&aq=f&oq=&aqi=g10");
assertEquals("ipod - Google Search", doc.title());
Elements results = doc.select("h3.r > a");
Expand All @@ -72,7 +75,7 @@ public void testGoogleSearchIpod() throws IOException {

@Test
public void testYahooJp() throws IOException {
File in = getFile("/htmltests/yahoo-jp.html");
File in = getFile("/htmltests/yahoo-jp.html.gz");
Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html"); // http charset is utf-8.
assertEquals("Yahoo! JAPAN", doc.title());
Element a = doc.select("a[href=t/2322m2]").first();
Expand Down Expand Up @@ -150,7 +153,7 @@ public void testBrokenHtml5CharsetWithASingleDoubleQuote() throws IOException {
@Test
public void testNytArticle() throws IOException {
// has tags like <nyt_text>
File in = getFile("/htmltests/nyt-article-1.html");
File in = getFile("/htmltests/nyt-article-1.html.gz");
Document doc = Jsoup.parse(in, null, "http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp");

Element headline = doc.select("nyt_headline[version=1.0]").first();
Expand All @@ -159,7 +162,7 @@ public void testNytArticle() throws IOException {

@Test
public void testYahooArticle() throws IOException {
File in = getFile("/htmltests/yahoo-article-1.html");
File in = getFile("/htmltests/yahoo-article-1.html.gz");
Document doc = Jsoup.parse(in, "UTF-8", "http://news.yahoo.com/s/nm/20100831/bs_nm/us_gm_china");
Element p = doc.select("p:contains(Volt will be sold in the United States)").first();
assertEquals("In July, GM said its electric Chevrolet Volt will be sold in the United States at $41,000 -- $8,000 more than its nearest competitor, the Nissan Leaf.", p.text());
Expand All @@ -179,7 +182,7 @@ public void testLowercaseUtf8Charset() throws IOException {
public void testXwiki() throws IOException {
// https://github.com/jhy/jsoup/issues/1324
// this tests that when in CharacterReader we hit a buffer while marked, we preserve the mark when buffered up and can rewind
File in = getFile("/htmltests/xwiki-1324.html");
File in = getFile("/htmltests/xwiki-1324.html.gz");
Document doc = Jsoup.parse(in, null, "https://localhost/");
assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());

Expand All @@ -194,9 +197,9 @@ public void testXwikiExpanded() throws IOException {
// https://github.com/jhy/jsoup/issues/1324
// this tests that if there is a huge illegal character reference, we can get through a buffer and rewind, and still catch that it's an invalid reference,
// and the parse tree is correct.
File in = getFile("/htmltests/xwiki-edit.html");
File in = getFile("/htmltests/xwiki-edit.html.gz");
Parser parser = Parser.htmlParser();
Document doc = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://localhost/", parser.setTrackErrors(100));
Document doc = Jsoup.parse(new GZIPInputStream(new FileInputStream(in)), "UTF-8", "https://localhost/", parser.setTrackErrors(100));
ParseErrorList errors = parser.getErrors();

assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
Expand All @@ -209,7 +212,7 @@ public void testXwikiExpanded() throws IOException {
}

@Test public void testWikiExpandedFromString() throws IOException {
File in = getFile("/htmltests/xwiki-edit.html");
File in = getFile("/htmltests/xwiki-edit.html.gz");
String html = getFileAsString(in);
Document doc = Jsoup.parse(html);
assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
Expand All @@ -218,7 +221,7 @@ public void testXwikiExpanded() throws IOException {
}

@Test public void testWikiFromString() throws IOException {
File in = getFile("/htmltests/xwiki-1324.html");
File in = getFile("/htmltests/xwiki-1324.html.gz");
String html = getFileAsString(in);
Document doc = Jsoup.parse(html);
assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
Expand All @@ -240,7 +243,14 @@ public static InputStream inputStreamFrom(String s) {
}

/**
 * Reads the given file fully and returns its content as a String. Files whose name ends in {@code .gz} are
 * decompressed with gzip first; all other files are read as is.
 *
 * @param file file to read
 * @return the (decompressed) file content, decoded with the JVM's default charset
 * @throws IOException on IO error
 */
public static String getFileAsString(File file) throws IOException {
    byte[] bytes;
    if (file.getName().endsWith(".gz")) {
        // Both streams are declared as resources so the file handle is released even if the gzip header is
        // invalid or the read fails part-way through.
        try (InputStream raw = new FileInputStream(file);
             InputStream stream = new GZIPInputStream(raw)) {
            // NOTE(review): presumably a max size of 0 means "no limit", and the returned buffer's backing
            // array holds exactly the bytes read; confirm against DataUtil.readToByteBuffer.
            ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0);
            bytes = byteBuffer.array();
        }
    } else {
        bytes = Files.readAllBytes(file.toPath());
    }
    return new String(bytes);
}

Expand Down
4 changes: 2 additions & 2 deletions src/test/java/org/jsoup/nodes/DocumentTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,13 @@ public class DocumentTest {
}

@Test public void testLocation() throws IOException {
File in = ParseTest.getFile("/htmltests/yahoo-jp.html");
File in = ParseTest.getFile("/htmltests/yahoo-jp.html.gz");
Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html");
String location = doc.location();
String baseUri = doc.baseUri();
assertEquals("http://www.yahoo.co.jp/index.html",location);
assertEquals("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/",baseUri);
in = ParseTest.getFile("/htmltests/nyt-article-1.html");
in = ParseTest.getFile("/htmltests/nyt-article-1.html.gz");
doc = Jsoup.parse(in, null, "http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp");
location = doc.location();
baseUri = doc.baseUri();
Expand Down
4 changes: 1 addition & 3 deletions src/test/java/org/jsoup/parser/HtmlParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1227,7 +1227,7 @@ public void testInvalidTableContents() throws IOException {
}

@Test public void characterReaderBuffer() throws IOException {
File in = ParseTest.getFile("/htmltests/character-reader-buffer.html");
File in = ParseTest.getFile("/htmltests/character-reader-buffer.html.gz");
Document doc = Jsoup.parse(in, "UTF-8");

String expectedHref = "http://www.domain.com/path?param_one=value&param_two=value";
Expand Down Expand Up @@ -1360,7 +1360,6 @@ public void testUNewlines() {
String html = "\n<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n";
Document doc = Jsoup.parse(html);
doc.outputSettings().prettyPrint(false);
System.out.println(doc.outerHtml());
assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n\n</body></html>\n", doc.outerHtml());
}

Expand All @@ -1369,7 +1368,6 @@ public void testUNewlines() {
// todo - ideally would move that space after /html to the body when the There <p> is seen
Document doc = Jsoup.parse(html);
doc.outputSettings().prettyPrint(false);
System.out.println(doc.outerHtml());
assertEquals("<html><head></head><body>One <p>Hello!</p><p>There</p></body></html> ", doc.outerHtml());
}
}
Binary file added src/test/resources/bomtests/bom_utf8.html.gz
Binary file not shown.
Loading

0 comments on commit 34715b3

Please sign in to comment.