diff --git a/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java b/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java
index d207a32..867e344 100644
--- a/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java
+++ b/src/main/java/com/elovirta/dita/markdown/MarkdownReader.java
@@ -27,6 +27,7 @@
 import java.net.URISyntaxException;
 import java.net.URL;
 import java.nio.CharBuffer;
+import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -351,10 +352,11 @@ Map.Entry getSchema(char[] data, InputSource input) throws SAXPars
   @VisibleForTesting
   char[] getMarkdownContent(final InputSource input) throws IOException {
     final CharArrayWriter out = new CharArrayWriter();
+    final String encoding = input.getEncoding() != null ? input.getEncoding() : StandardCharsets.UTF_8.name();
+    final boolean isUtf8 = "UTF-8".equalsIgnoreCase(encoding);
     if (input.getByteStream() != null) {
-      final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8";
       try (
-        BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding)
+        BufferedInputStream is = isUtf8
           ? consumeBOM(input.getByteStream())
           : new BufferedInputStream(input.getByteStream());
         Reader in = new InputStreamReader(is, encoding)
@@ -362,7 +364,8 @@ char[] getMarkdownContent(final InputSource input) throws IOException {
         copy(in, out);
       }
     } else if (input.getCharacterStream() != null) {
-      try (Reader in = input.getCharacterStream()) {
+      // A Reader yields decoded characters, so strip a leading U+FEFF whatever encoding was declared.
+      try (Reader in = consumeBOM(input.getCharacterStream())) {
         copy(in, out);
       }
     } else if (input.getSystemId() != null) {
@@ -372,11 +375,8 @@ char[] getMarkdownContent(final InputSource input) throws IOException {
       } catch (final URISyntaxException e) {
         throw new IllegalArgumentException(e);
       }
-      final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8";
       try (
-        BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding)
-          ? consumeBOM(inUrl.openStream())
-          : new BufferedInputStream(inUrl.openStream());
+        BufferedInputStream is = isUtf8 ? consumeBOM(inUrl.openStream()) : new BufferedInputStream(inUrl.openStream());
         Reader in = new InputStreamReader(is, encoding)
       ) {
         copy(in, out);
@@ -403,4 +403,25 @@ private BufferedInputStream consumeBOM(final InputStream in) throws IOException
     }
     return bin;
   }
+
+  /**
+   * Returns a buffered reader over {@code in} with a leading byte order mark (U+FEFF) skipped, if present.
+   *
+   * @param in the original reader
+   * @return a reader without a possible BOM
+   * @throws IOException if marking or resetting the buffered reader fails
+   */
+  private BufferedReader consumeBOM(final Reader in) throws IOException {
+    final BufferedReader bin = new BufferedReader(in);
+    // One char of look-ahead is enough: a decoded BOM is the single char U+FEFF.
+    bin.mark(1);
+    try {
+      if (bin.read() != '\uFEFF') {
+        bin.reset();
+      }
+    } catch (final IOException e) {
+      bin.reset();
+    }
+    return bin;
+  }
 }
diff --git a/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java b/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java
index f454b1d..3b8e3c3 100644
--- a/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java
+++ b/src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java
@@ -10,12 +10,16 @@
 import com.vladsch.flexmark.util.data.MutableDataSet;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.net.URI;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Map;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
 import org.junit.jupiter.params.provider.ValueSource;
 import org.xml.sax.*;
 import org.xml.sax.helpers.XMLFilterImpl;
@@ -191,13 +195,44 @@ public void test_fail(String file) {
     }
   }
 
-  @Test
-  public void getMarkdownContent_url() throws Exception {
-    final String input = getSrc() + "testBOM.md";
+  @ParameterizedTest
+  @CsvSource({ "markdown/testBOM.md, UTF-8", "markdown/testNoBOM.md, UTF-8", "markdown/testNoBOM.md, ISO-8859-1" })
+  public void getMarkdownContent_url(String input, String encoding) throws Exception {
     final URL in = getClass().getResource("/" + input);
     final InputSource i = new InputSource(in.toString());
-    final char[] content = new MarkdownReader().getMarkdownContent(i);
-    assertEquals('W', content[0]);
+    i.setEncoding(encoding);
+    final char[] act = new MarkdownReader().getMarkdownContent(i);
+    assertEquals('W', act[0]);
+  }
+
+  @ParameterizedTest
+  @CsvSource({ "/markdown/testBOM.md, UTF-8", "/markdown/testNoBOM.md, UTF-8", "/markdown/testNoBOM.md, ISO-8859-1" })
+  public void getMarkdownContent_byteStream(String input, String encoding) throws Exception {
+    try (InputStream in = getClass().getResourceAsStream(input)) {
+      final InputSource i = new InputSource(in);
+      i.setEncoding(encoding);
+      final char[] act = new MarkdownReader().getMarkdownContent(i);
+      assertEquals('W', act[0]);
+    }
+  }
+
+  @ParameterizedTest
+  @CsvSource({
+    "/markdown/testBOM.md, UTF-8",
+    "/markdown/testBOM.md, ISO-8859-1",
+    "/markdown/testNoBOM.md, UTF-8",
+    "/markdown/testNoBOM.md, ISO-8859-1"
+  })
+  public void getMarkdownContent_characterStream(String input, String encoding) throws Exception {
+    try (
+      InputStream in = getClass().getResourceAsStream(input);
+      Reader r = new InputStreamReader(in, StandardCharsets.UTF_8)
+    ) {
+      final InputSource i = new InputSource(r);
+      i.setEncoding(encoding);
+      final char[] act = new MarkdownReader().getMarkdownContent(i);
+      assertEquals('W', act[0]);
+    }
   }
 
   @Test