Add BOM support to reader and improve tests

Signed-off-by: Jarno Elovirta <jarno@elovirta.com>
jelovirt · May 9, 2024 · 9c23b78 · 9c23b78
1 parent 831acd4
commit 9c23b78
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 12 deletions.
diff --git a/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java b/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java
@@ -27,6 +27,7 @@
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.nio.CharBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -351,18 +352,19 @@ Map.Entry<URI, Locator> getSchema(char[] data, InputSource input) throws SAXPars
   @VisibleForTesting
   char[] getMarkdownContent(final InputSource input) throws IOException {
     final CharArrayWriter out = new CharArrayWriter();
+    final String encoding = input.getEncoding() != null ? input.getEncoding() : StandardCharsets.UTF_8.name();
+    final boolean isUtf8 = "UTF-8".equalsIgnoreCase(encoding);
     if (input.getByteStream() != null) {
-      final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8";
       try (
-        BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding)
+        BufferedInputStream is = isUtf8
           ? consumeBOM(input.getByteStream())
           : new BufferedInputStream(input.getByteStream());
         Reader in = new InputStreamReader(is, encoding)
       ) {
         copy(in, out);
       }
     } else if (input.getCharacterStream() != null) {
-      try (Reader in = input.getCharacterStream()) {
+      try (Reader in = isUtf8 ? consumeBOM(input.getCharacterStream()) : input.getCharacterStream()) {
         copy(in, out);
       }
     } else if (input.getSystemId() != null) {
@@ -372,11 +374,8 @@ char[] getMarkdownContent(final InputSource input) throws IOException {
       } catch (final URISyntaxException e) {
         throw new IllegalArgumentException(e);
       }
-      final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8";
       try (
-        BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding)
-          ? consumeBOM(inUrl.openStream())
-          : new BufferedInputStream(inUrl.openStream());
+        BufferedInputStream is = isUtf8 ? consumeBOM(inUrl.openStream()) : new BufferedInputStream(inUrl.openStream());
         Reader in = new InputStreamReader(is, encoding)
       ) {
         copy(in, out);
@@ -403,4 +402,23 @@ private BufferedInputStream consumeBOM(final InputStream in) throws IOException
     }
     return bin;
   }
+
+  /**
+   * Wraps the reader in a {@link BufferedReader} and consumes a leading U+FEFF (BOM) character if present.
+   *
+   * @param in the original reader
+   * @return a buffered reader over {@code in}, positioned after a possible BOM
+   */
+  private BufferedReader consumeBOM(final Reader in) throws IOException {
+    final BufferedReader bin = new BufferedReader(in);
+    bin.mark(1); // one character of read-ahead is enough to peek at a possible BOM
+    try {
+      if (bin.read() != '\uFEFF') { // also true at end of stream, where read() returns -1
+        bin.reset(); // not a BOM: rewind so the character stays part of the content
+      }
+    } catch (final IOException e) {
+      bin.reset(); // the failed read is not rethrown here; reset() itself may still throw
+    }
+    return bin;
+  }
 }
diff --git a/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java b/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java
@@ -10,12 +10,16 @@
 import com.vladsch.flexmark.util.data.MutableDataSet;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.net.URI;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Map;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
 import org.junit.jupiter.params.provider.ValueSource;
 import org.xml.sax.*;
 import org.xml.sax.helpers.XMLFilterImpl;
@@ -191,13 +195,39 @@ public void test_fail(String file) {
     }
   }
 
-  @Test
-  public void getMarkdownContent_url() throws Exception {
-    final String input = getSrc() + "testBOM.md";
+  @ParameterizedTest
+  @CsvSource({ "markdown/testBOM.md, UTF-8", "markdown/testNoBOM.md, UTF-8", "markdown/testNoBOM.md, ISO-8859-1" })
+  public void getMarkdownContent_url(String input, String encoding) throws Exception {
     final URL in = getClass().getResource("/" + input);
     final InputSource i = new InputSource(in.toString());
-    final char[] content = new MarkdownReader().getMarkdownContent(i);
-    assertEquals('W', content[0]);
+    i.setEncoding(encoding); // only a system id and an encoding are set: no byte or character stream
+    final char[] act = new MarkdownReader().getMarkdownContent(i);
+    assertEquals('W', act[0]); // first returned character must be 'W' for every fixture/encoding pair
+  }
+
+  @ParameterizedTest
+  @CsvSource({ "/markdown/testBOM.md, UTF-8", "/markdown/testNoBOM.md, UTF-8", "/markdown/testNoBOM.md, ISO-8859-1" })
+  public void getMarkdownContent_byteStream(String input, String encoding) throws Exception {
+    try (InputStream in = getClass().getResourceAsStream(input)) {
+      final InputSource i = new InputSource(in); // byte stream input: the reader under test does the decoding
+      i.setEncoding(encoding);
+      final char[] act = new MarkdownReader().getMarkdownContent(i);
+      assertEquals('W', act[0]); // first returned character must be 'W' for every fixture/encoding pair
+    }
+  }
+
+  @ParameterizedTest
+  @CsvSource({ "/markdown/testBOM.md, UTF-8", "/markdown/testNoBOM.md, UTF-8", "/markdown/testNoBOM.md, ISO-8859-1" })
+  public void getMarkdownContent_characterStream(String input, String encoding) throws Exception {
+    try (
+      InputStream in = getClass().getResourceAsStream(input);
+      Reader r = new InputStreamReader(in, StandardCharsets.UTF_8) // always decoded as UTF-8 here, whatever the encoding parameter says
+    ) {
+      final InputSource i = new InputSource(r); // character stream input: already decoded before it reaches the reader under test
+      i.setEncoding(encoding); // the parameter only reaches the reader under test through this setter
+      final char[] act = new MarkdownReader().getMarkdownContent(i);
+      assertEquals('W', act[0]); // first returned character must be 'W' for every fixture/encoding pair
+    }
   }
 
   @Test