Skip to content

Commit

Permalink
Add BOM support to reader and improve tests
Browse files: browse the repository at this point in the history
Signed-off-by: Jarno Elovirta <jarno@elovirta.com>
  • Loading branch information
jelovirt committed May 9, 2024
1 parent 831acd4 commit 9c23b78
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 12 deletions.
32 changes: 25 additions & 7 deletions src/main/java/com/elovirta/dita/markdown/MarkdownReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.CharBuffer;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -351,18 +352,19 @@ Map.Entry<URI, Locator> getSchema(char[] data, InputSource input) throws SAXPars
@VisibleForTesting
char[] getMarkdownContent(final InputSource input) throws IOException {
final CharArrayWriter out = new CharArrayWriter();
final String encoding = input.getEncoding() != null ? input.getEncoding() : StandardCharsets.UTF_8.name();
final boolean isUtf8 = "UTF-8".equalsIgnoreCase(encoding);
if (input.getByteStream() != null) {
final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8";
try (
BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding)
BufferedInputStream is = isUtf8
? consumeBOM(input.getByteStream())
: new BufferedInputStream(input.getByteStream());
Reader in = new InputStreamReader(is, encoding)
) {
copy(in, out);
}
} else if (input.getCharacterStream() != null) {
try (Reader in = input.getCharacterStream()) {
try (Reader in = isUtf8 ? consumeBOM(input.getCharacterStream()) : input.getCharacterStream()) {
copy(in, out);
}
} else if (input.getSystemId() != null) {
Expand All @@ -372,11 +374,8 @@ char[] getMarkdownContent(final InputSource input) throws IOException {
} catch (final URISyntaxException e) {
throw new IllegalArgumentException(e);
}
final String encoding = input.getEncoding() != null ? input.getEncoding() : "UTF-8";
try (
BufferedInputStream is = "UTF-8".equalsIgnoreCase(encoding)
? consumeBOM(inUrl.openStream())
: new BufferedInputStream(inUrl.openStream());
BufferedInputStream is = isUtf8 ? consumeBOM(inUrl.openStream()) : new BufferedInputStream(inUrl.openStream());
Reader in = new InputStreamReader(is, encoding)
) {
copy(in, out);
Expand All @@ -403,4 +402,23 @@ private BufferedInputStream consumeBOM(final InputStream in) throws IOException
}
return bin;
}

/**
 * Wraps the given reader in a {@link BufferedReader} and discards a leading
 * byte order mark (U+FEFF) when the first character is one.
 *
 * @param in the original reader
 * @return a buffered reader positioned after the BOM, or at the start when no BOM was read
 * @throws IOException if marking or rewinding the buffered reader fails
 */
private BufferedReader consumeBOM(final Reader in) throws IOException {
  final BufferedReader buffered = new BufferedReader(in);
  // One character of look-ahead is all that is needed to inspect a potential BOM.
  buffered.mark(1);
  boolean bomSkipped;
  try {
    bomSkipped = buffered.read() == '\uFEFF';
  } catch (final IOException ignored) {
    // A failed peek is handled like "no BOM": rewind and hand the reader back as-is.
    bomSkipped = false;
  }
  if (!bomSkipped) {
    buffered.reset();
  }
  return buffered;
}
}
40 changes: 35 additions & 5 deletions src/test/java/com/elovirta/dita/markdown/MarkdownReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,16 @@
import com.vladsch.flexmark.util.data.MutableDataSet;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Map;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.CsvSource;
import org.junit.jupiter.params.provider.ValueSource;
import org.xml.sax.*;
import org.xml.sax.helpers.XMLFilterImpl;
Expand Down Expand Up @@ -191,13 +195,39 @@ public void test_fail(String file) {
}
}

@Test
public void getMarkdownContent_url() throws Exception {
final String input = getSrc() + "testBOM.md";
// Reads each fixture by URL (system ID only, no stream supplied) and checks that the
// content returned by MarkdownReader starts with 'W'.
// NOTE(review): this listing comes from a rendered diff without +/- markers; the
// "content" read and its assertion below look like the pre-change lines shown next to
// their replacements ("act") — confirm against the repository before relying on them.
@ParameterizedTest
@CsvSource({ "markdown/testBOM.md, UTF-8", "markdown/testNoBOM.md, UTF-8", "markdown/testNoBOM.md, ISO-8859-1" })
public void getMarkdownContent_url(String input, String encoding) throws Exception {
// Fixture paths in the CSV source are relative, so the leading slash is added here.
final URL in = getClass().getResource("/" + input);
// Only the system identifier is set on the InputSource.
final InputSource i = new InputSource(in.toString());
final char[] content = new MarkdownReader().getMarkdownContent(i);
assertEquals('W', content[0]);
// The encoding under test is applied before the "act" read below.
i.setEncoding(encoding);
final char[] act = new MarkdownReader().getMarkdownContent(i);
assertEquals('W', act[0]);
}

/**
 * Feeds each fixture to the reader as a byte stream with the given encoding and checks
 * that the returned content starts with {@code 'W'}.
 *
 * @param input classpath location of the Markdown fixture
 * @param encoding encoding name set on the input source
 */
@ParameterizedTest
@CsvSource({ "/markdown/testBOM.md, UTF-8", "/markdown/testNoBOM.md, UTF-8", "/markdown/testNoBOM.md, ISO-8859-1" })
public void getMarkdownContent_byteStream(String input, String encoding) throws Exception {
  try (InputStream resource = getClass().getResourceAsStream(input)) {
    final InputSource source = new InputSource(resource);
    source.setEncoding(encoding);
    final MarkdownReader reader = new MarkdownReader();
    final char[] content = reader.getMarkdownContent(source);
    // Every fixture is expected to begin with 'W' once it has been read.
    assertEquals('W', content[0]);
  }
}

/**
 * Feeds each fixture to the reader as a character stream and checks that the returned
 * content starts with {@code 'W'}. The fixture bytes are always decoded as UTF-8 here;
 * the {@code encoding} argument is only set on the input source.
 *
 * @param input classpath location of the Markdown fixture
 * @param encoding encoding name set on the input source
 */
@ParameterizedTest
@CsvSource({ "/markdown/testBOM.md, UTF-8", "/markdown/testNoBOM.md, UTF-8", "/markdown/testNoBOM.md, ISO-8859-1" })
public void getMarkdownContent_characterStream(String input, String encoding) throws Exception {
  try (
    InputStream resource = getClass().getResourceAsStream(input);
    Reader chars = new InputStreamReader(resource, StandardCharsets.UTF_8)
  ) {
    final InputSource source = new InputSource(chars);
    source.setEncoding(encoding);
    final MarkdownReader reader = new MarkdownReader();
    final char[] content = reader.getMarkdownContent(source);
    // Every fixture is expected to begin with 'W' once it has been read.
    assertEquals('W', content[0]);
  }
}

@Test
Expand Down

0 comments on commit 9c23b78

Please sign in to comment.