Added support for loading gzipped HTML files.

Mostly this is so I can compress the test HTML files in the repo, so that GitHub stops reporting jsoup as an HTML project.
jhy · Mar 1, 2020 · 34715b3 · 34715b3
1 parent 5136929
commit 34715b3
Show file tree

Hide file tree

Showing 31 changed files with 84 additions and 10,574 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,5 +1,8 @@
 jsoup changelog
 
+*** Release 1.13.2 [PENDING]
+  * Improvement: added support for loading and parsing gzipped HTML files in Jsoup.parse(File in, charset, baseUri).
+
 *** Release 1.13.1 [2020-Feb-29]
   * Improvement: added Element#closest(selector), which walks up the tree to find the nearest element matching the
     selector.

diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -2,6 +2,7 @@
 
 import org.jsoup.UncheckedIOException;
 import org.jsoup.internal.ConstrainableInputStream;
+import org.jsoup.internal.Normalizer;
 import org.jsoup.internal.StringUtil;
 import org.jsoup.nodes.Comment;
 import org.jsoup.nodes.Document;
@@ -28,6 +29,7 @@
 import java.util.Random;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
+import java.util.zip.GZIPInputStream;
 
 /**
  * Internal static utilities for handling data.
@@ -45,15 +47,26 @@ public final class DataUtil {
     private DataUtil() {}
 
     /**
-     * Loads a file to a Document.
+     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
+     * are supported in addition to uncompressed files.
+     *
      * @param in file to load
-     * @param charsetName character set of input
+     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
+     *     the file will always override this setting.
      * @param baseUri base URI of document, to resolve relative links against
      * @return Document
      * @throws IOException on IO error
      */
     public static Document load(File in, String charsetName, String baseUri) throws IOException {
-        return parseInputStream(new FileInputStream(in), charsetName, baseUri, Parser.htmlParser());
+        InputStream stream = new FileInputStream(in);
+        String name = Normalizer.lowerCase(in.getName());
+        if (name.endsWith(".gz") || name.endsWith(".z")) {
+            // unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
+            boolean zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
+            stream.close();
+            stream = zipped ? new GZIPInputStream(new FileInputStream(in)) : new FileInputStream(in);
+        }
+        return parseInputStream(stream, charsetName, baseUri, Parser.htmlParser());
     }
 
     /**

diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -165,6 +165,14 @@ public void supportsUTF8BOM() throws IOException {
         assertEquals("OK", doc.head().select("title").text());
     }
 
+    @Test
+    public void supportsZippedUTF8BOM() throws IOException {
+        File in = getFile("/bomtests/bom_utf8.html.gz");
+        Document doc = Jsoup.parse(in, null, "http://example.com");
+        assertEquals("OK", doc.head().select("title").text());
+        assertEquals("There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", doc.body().text());
+    }
+
     @Test
     public void supportsXmlCharsetDeclaration() throws IOException {
         String encoding = "iso-8859-1";
@@ -177,4 +185,27 @@ public void supportsXmlCharsetDeclaration() throws IOException {
         Document doc = Jsoup.parse(soup, null, "");
         assertEquals("Hellö Wörld!", doc.body().text());
     }
+
+
+    @Test public void lLoadsGzipFile() throws IOException {
+        File in = getFile("/htmltests/gzip.html.gz");
+        Document doc = Jsoup.parse(in, null);
+        assertEquals("Gzip test", doc.title());
+        assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
+    }
+
+    @Test public void loadsZGzipFile() throws IOException {
+        // compressed on win, with z suffix
+        File in = getFile("/htmltests/gzip.html.z");
+        Document doc = Jsoup.parse(in, null);
+        assertEquals("Gzip test", doc.title());
+        assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
+    }
+
+    @Test public void handlesFakeGzipFile () throws IOException {
+        File in = getFile("/htmltests/fake-gzip.html.gz");
+        Document doc = Jsoup.parse(in, null);
+        assertEquals("This is not gzipped", doc.title());
+        assertEquals("And should still be readable.", doc.selectFirst("p").text());
+    }
 }
diff --git a/src/test/java/org/jsoup/helper/W3CDomTest.java b/src/test/java/org/jsoup/helper/W3CDomTest.java
@@ -84,7 +84,7 @@ public void simpleConversion() {
 
     @Test
     public void convertsGoogle() throws IOException {
-        File in = ParseTest.getFile("/htmltests/google-ipod.html");
+        File in = ParseTest.getFile("/htmltests/google-ipod.html.gz");
         org.jsoup.nodes.Document doc = Jsoup.parse(in, "UTF8");
 
         W3CDom w3c = new W3CDom();
@@ -108,7 +108,7 @@ public void convertsGoogle() throws IOException {
 
     @Test
     public void convertsGoogleLocation() throws IOException {
-        File in = ParseTest.getFile("/htmltests/google-ipod.html");
+        File in = ParseTest.getFile("/htmltests/google-ipod.html.gz");
         org.jsoup.nodes.Document doc = Jsoup.parse(in, "UTF8");
 
         W3CDom w3c = new W3CDom();

diff --git a/src/test/java/org/jsoup/integration/ConnectTest.java b/src/test/java/org/jsoup/integration/ConnectTest.java
@@ -233,7 +233,7 @@ public void doesPut() throws IOException {
     @Test
     public void postFiles() throws IOException {
         File thumb = ParseTest.getFile("/htmltests/thumb.jpg");
-        File html = ParseTest.getFile("/htmltests/google-ipod.html");
+        File html = ParseTest.getFile("/htmltests/google-ipod.html.gz");
 
         Document res = Jsoup
             .connect(EchoServlet.Url)
@@ -247,8 +247,8 @@ public void postFiles() throws IOException {
 
         assertEquals("application/octet-stream", ihVal("Part secondPart ContentType", res));
         assertEquals("secondPart", ihVal("Part secondPart Name", res));
-        assertEquals("google-ipod.html", ihVal("Part secondPart Filename", res));
-        assertEquals("43963", ihVal("Part secondPart Size", res));
+        assertEquals("google-ipod.html.gz", ihVal("Part secondPart Filename", res));
+        assertEquals("12212", ihVal("Part secondPart Size", res));
 
         assertEquals("image/jpeg", ihVal("Part firstPart ContentType", res));
         assertEquals("firstPart", ihVal("Part firstPart Name", res));

diff --git a/src/test/java/org/jsoup/integration/ParseTest.java b/src/test/java/org/jsoup/integration/ParseTest.java
@@ -1,6 +1,7 @@
 package org.jsoup.integration;
 
 import org.jsoup.Jsoup;
+import org.jsoup.helper.DataUtil;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.parser.ParseErrorList;
@@ -11,8 +12,10 @@
 import java.io.*;
 import java.net.URISyntaxException;
 import java.net.URL;
+import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.util.zip.GZIPInputStream;
 
 import static org.junit.Assert.*;
 
@@ -25,7 +28,7 @@ public class ParseTest {
 
     @Test
     public void testSmhBizArticle() throws IOException {
-        File in = getFile("/htmltests/smh-biz-article-1.html");
+        File in = getFile("/htmltests/smh-biz-article-1.html.gz");
         Document doc = Jsoup.parse(in, "UTF-8",
                 "http://www.smh.com.au/business/the-boards-next-fear-the-female-quota-20100106-lteq.html");
         assertEquals("The board’s next fear: the female quota",
@@ -40,7 +43,7 @@ public void testSmhBizArticle() throws IOException {
 
     @Test
     public void testNewsHomepage() throws IOException {
-        File in = getFile("/htmltests/news-com-au-home.html");
+        File in = getFile("/htmltests/news-com-au-home.html.gz");
         Document doc = Jsoup.parse(in, "UTF-8", "http://www.news.com.au/");
         assertEquals("News.com.au | News from Australia and around the world online | NewsComAu", doc.title());
         assertEquals("Brace yourself for Metro meltdown", doc.select(".id1225817868581 h4").text().trim());
@@ -58,7 +61,7 @@ public void testNewsHomepage() throws IOException {
 
     @Test
     public void testGoogleSearchIpod() throws IOException {
-        File in = getFile("/htmltests/google-ipod.html");
+        File in = getFile("/htmltests/google-ipod.html.gz");
         Document doc = Jsoup.parse(in, "UTF-8", "http://www.google.com/search?hl=en&q=ipod&aq=f&oq=&aqi=g10");
         assertEquals("ipod - Google Search", doc.title());
         Elements results = doc.select("h3.r > a");
@@ -72,7 +75,7 @@ public void testGoogleSearchIpod() throws IOException {
 
     @Test
     public void testYahooJp() throws IOException {
-        File in = getFile("/htmltests/yahoo-jp.html");
+        File in = getFile("/htmltests/yahoo-jp.html.gz");
         Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html"); // http charset is utf-8.
         assertEquals("Yahoo! JAPAN", doc.title());
         Element a = doc.select("a[href=t/2322m2]").first();
@@ -150,7 +153,7 @@ public void testBrokenHtml5CharsetWithASingleDoubleQuote() throws IOException {
     @Test
     public void testNytArticle() throws IOException {
         // has tags like <nyt_text>
-        File in = getFile("/htmltests/nyt-article-1.html");
+        File in = getFile("/htmltests/nyt-article-1.html.gz");
         Document doc = Jsoup.parse(in, null, "http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp");
 
         Element headline = doc.select("nyt_headline[version=1.0]").first();
@@ -159,7 +162,7 @@ public void testNytArticle() throws IOException {
 
     @Test
     public void testYahooArticle() throws IOException {
-        File in = getFile("/htmltests/yahoo-article-1.html");
+        File in = getFile("/htmltests/yahoo-article-1.html.gz");
         Document doc = Jsoup.parse(in, "UTF-8", "http://news.yahoo.com/s/nm/20100831/bs_nm/us_gm_china");
         Element p = doc.select("p:contains(Volt will be sold in the United States)").first();
         assertEquals("In July, GM said its electric Chevrolet Volt will be sold in the United States at $41,000 -- $8,000 more than its nearest competitor, the Nissan Leaf.", p.text());
@@ -179,7 +182,7 @@ public void testLowercaseUtf8Charset() throws IOException {
     public void testXwiki() throws IOException {
         // https://github.com/jhy/jsoup/issues/1324
         // this tests that when in CharacterReader we hit a buffer while marked, we preserve the mark when buffered up and can rewind
-        File in = getFile("/htmltests/xwiki-1324.html");
+        File in = getFile("/htmltests/xwiki-1324.html.gz");
         Document doc = Jsoup.parse(in, null, "https://localhost/");
         assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
 
@@ -194,9 +197,9 @@ public void testXwikiExpanded() throws IOException {
         // https://github.com/jhy/jsoup/issues/1324
         // this tests that if there is a huge illegal character reference, we can get through a buffer and rewind, and still catch that it's an invalid refence,
         // and the parse tree is correct.
-        File in = getFile("/htmltests/xwiki-edit.html");
+        File in = getFile("/htmltests/xwiki-edit.html.gz");
         Parser parser = Parser.htmlParser();
-        Document doc = Jsoup.parse(new FileInputStream(in), "UTF-8", "https://localhost/", parser.setTrackErrors(100));
+        Document doc = Jsoup.parse(new GZIPInputStream(new FileInputStream(in)), "UTF-8", "https://localhost/", parser.setTrackErrors(100));
         ParseErrorList errors = parser.getErrors();
 
         assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
@@ -209,7 +212,7 @@ public void testXwikiExpanded() throws IOException {
     }
 
     @Test public void testWikiExpandedFromString() throws IOException {
-        File in = getFile("/htmltests/xwiki-edit.html");
+        File in = getFile("/htmltests/xwiki-edit.html.gz");
         String html = getFileAsString(in);
         Document doc = Jsoup.parse(html);
         assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
@@ -218,7 +221,7 @@ public void testXwikiExpanded() throws IOException {
     }
 
     @Test public void testWikiFromString() throws IOException {
-        File in = getFile("/htmltests/xwiki-1324.html");
+        File in = getFile("/htmltests/xwiki-1324.html.gz");
         String html = getFileAsString(in);
         Document doc = Jsoup.parse(html);
         assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
@@ -240,7 +243,14 @@ public static InputStream inputStreamFrom(String s) {
     }
 
     public static String getFileAsString(File file) throws IOException {
-        byte[] bytes = Files.readAllBytes(file.toPath());
+        byte[] bytes;
+        if (file.getName().endsWith(".gz")) {
+            InputStream stream = new GZIPInputStream(new FileInputStream(file));
+            ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0);
+            bytes = byteBuffer.array();
+        } else {
+            bytes = Files.readAllBytes(file.toPath());
+        }
         return new String(bytes);
     }
 

diff --git a/src/test/java/org/jsoup/nodes/DocumentTest.java b/src/test/java/org/jsoup/nodes/DocumentTest.java
@@ -100,13 +100,13 @@ public class DocumentTest {
     }
 
     @Test public void testLocation() throws IOException {
-    	File in = ParseTest.getFile("/htmltests/yahoo-jp.html");
+    	File in = ParseTest.getFile("/htmltests/yahoo-jp.html.gz");
         Document doc = Jsoup.parse(in, "UTF-8", "http://www.yahoo.co.jp/index.html");
         String location = doc.location();
         String baseUri = doc.baseUri();
         assertEquals("http://www.yahoo.co.jp/index.html",location);
         assertEquals("http://www.yahoo.co.jp/_ylh=X3oDMTB0NWxnaGxsBF9TAzIwNzcyOTYyNjUEdGlkAzEyBHRtcGwDZ2Ex/",baseUri);
-        in = ParseTest.getFile("/htmltests/nyt-article-1.html");
+        in = ParseTest.getFile("/htmltests/nyt-article-1.html.gz");
         doc = Jsoup.parse(in, null, "http://www.nytimes.com/2010/07/26/business/global/26bp.html?hp");
         location = doc.location();
         baseUri = doc.baseUri();

diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -1227,7 +1227,7 @@ public void testInvalidTableContents() throws IOException {
     }
 
     @Test public void characterReaderBuffer() throws IOException {
-        File in = ParseTest.getFile("/htmltests/character-reader-buffer.html");
+        File in = ParseTest.getFile("/htmltests/character-reader-buffer.html.gz");
         Document doc = Jsoup.parse(in, "UTF-8");
 
         String expectedHref = "http://www.domain.com/path?param_one=value&param_two=value";
@@ -1360,7 +1360,6 @@ public void testUNewlines() {
         String html = "\n<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n</body>\n</html>\n";
         Document doc = Jsoup.parse(html);
         doc.outputSettings().prettyPrint(false);
-        System.out.println(doc.outerHtml());
         assertEquals("<!doctype html>\n<html>\n<head>\n<title>Hello</title>\n</head>\n<body>\n<p>One</p>\n\n</body></html>\n", doc.outerHtml());
     }
 
@@ -1369,7 +1368,6 @@ public void testUNewlines() {
         // todo - ideally would move that space afer /html to the body when the There <p> is seen
         Document doc = Jsoup.parse(html);
         doc.outputSettings().prettyPrint(false);
-        System.out.println(doc.outerHtml());
         assertEquals("<html><head></head><body>One  <p>Hello!</p><p>There</p></body></html> ", doc.outerHtml());
     }
 }
diff --git a/src/test/resources/bomtests/bom_utf8.html.gz b/src/test/resources/bomtests/bom_utf8.html.gz