Skip to content

DefaultEntityResolver: use the HTML5 entities as fallback if no subset is found in getExternalSubset() #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 18 additions & 3 deletions junit/io/sf/carte/doc/xml/dtd/DefaultEntityResolverTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,6 @@ public void getExternalSubsetStringString() throws SAXException, IOException {
Reader re = isrc.getCharacterStream();
assertNotNull(re);
re.close();
//
isrc = resolver.getExternalSubset("foo", null);
assertNull(isrc);
// SVG
isrc = resolver.getExternalSubset("svg", null);
assertNotNull(isrc);
Expand All @@ -58,6 +55,24 @@ public void getExternalSubsetStringString() throws SAXException, IOException {
re.close();
}

/**
 * Checks that asking the resolver for the external subset of an unknown root
 * element name ("foo") yields a non-null input source that carries no public
 * or system identifier, and whose character stream begins with the expected
 * entity declarations.
 *
 * @throws SAXException if the resolver rejects the request.
 * @throws IOException  if the character stream cannot be read or closed.
 */
@Test
public void getExternalSubsetStringStringUnknownSubset() throws SAXException, IOException {
	InputSource isrc = resolver.getExternalSubset("foo", null);
	assertNotNull(isrc);
	assertNull(isrc.getPublicId());
	assertNull(isrc.getSystemId());
	Reader re = isrc.getCharacterStream();
	assertNotNull(re);
	char[] cbuf = new char[40];
	int count = 0;
	try {
		// A single read() may legally return fewer characters than requested,
		// so keep reading until the buffer is full or the stream ends.
		while (count < cbuf.length) {
			int n = re.read(cbuf, count, cbuf.length - count);
			if (n == -1) {
				break;
			}
			count += n;
		}
	} finally {
		re.close();
	}
	// Only use the characters that were actually read (no trailing NULs)
	String sbuf = new String(cbuf, 0, count);
	assertEquals("<!ENTITY Tab \"&#x9;\"><!ENTITY NewLine \"&", sbuf);
}

@Test
public void resolveEntityStringString() throws SAXException, IOException {
InputSource isrc = resolver.resolveEntity(DocumentTypeDeclaration.XHTML1_TRA_PUBLICID,
Expand Down
20 changes: 18 additions & 2 deletions junit/io/sf/carte/doc/xml/dtd/StackedEntityResolverTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,24 @@ public void getExternalSubsetStringString() throws SAXException, IOException {
Reader re = isrc.getCharacterStream();
assertNotNull(re);
re.close();
isrc = stackedResolver.getExternalSubset("foo", null);
assertNull(isrc);
}

/**
 * Checks that asking the stacked resolver for the external subset of an
 * unknown root element name ("foo") yields a non-null input source that
 * carries no public or system identifier, and whose character stream begins
 * with the expected entity declarations.
 *
 * @throws SAXException if the resolver rejects the request.
 * @throws IOException  if the character stream cannot be read or closed.
 */
@Test
public void getExternalSubsetStringStringUnknownSubset() throws SAXException, IOException {
	InputSource isrc = stackedResolver.getExternalSubset("foo", null);
	assertNotNull(isrc);
	assertNull(isrc.getPublicId());
	assertNull(isrc.getSystemId());
	Reader re = isrc.getCharacterStream();
	assertNotNull(re);
	char[] cbuf = new char[40];
	int count = 0;
	try {
		// A single read() may legally return fewer characters than requested,
		// so keep reading until the buffer is full or the stream ends.
		while (count < cbuf.length) {
			int n = re.read(cbuf, count, cbuf.length - count);
			if (n == -1) {
				break;
			}
			count += n;
		}
	} finally {
		re.close();
	}
	// Only use the characters that were actually read (no trailing NULs)
	String sbuf = new String(cbuf, 0, count);
	assertEquals("<!ENTITY Tab \"&#x9;\"><!ENTITY NewLine \"&", sbuf);
}

@Test
Expand Down
20 changes: 18 additions & 2 deletions junit/io/sf/carte/doc/xml/dtd/StackedEntityResolverTest2.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,24 @@ public void getExternalSubsetStringString() throws SAXException, IOException {
Reader re = isrc.getCharacterStream();
assertNotNull(re);
re.close();
isrc = stackedResolver.getExternalSubset("foo", null);
assertNull(isrc);
}

/**
 * Checks that asking the stacked resolver for the external subset of an
 * unknown root element name ("foo") yields a non-null input source that
 * carries no public or system identifier, and whose character stream begins
 * with the expected entity declarations.
 *
 * @throws SAXException if the resolver rejects the request.
 * @throws IOException  if the character stream cannot be read or closed.
 */
@Test
public void getExternalSubsetStringStringUnknownSubset() throws SAXException, IOException {
	InputSource isrc = stackedResolver.getExternalSubset("foo", null);
	assertNotNull(isrc);
	assertNull(isrc.getPublicId());
	assertNull(isrc.getSystemId());
	Reader re = isrc.getCharacterStream();
	assertNotNull(re);
	char[] cbuf = new char[40];
	int count = 0;
	try {
		// A single read() may legally return fewer characters than requested,
		// so keep reading until the buffer is full or the stream ends.
		while (count < cbuf.length) {
			int n = re.read(cbuf, count, cbuf.length - count);
			if (n == -1) {
				break;
			}
			count += n;
		}
	} finally {
		re.close();
	}
	// Only use the characters that were actually read (no trailing NULs)
	String sbuf = new String(cbuf, 0, count);
	assertEquals("<!ENTITY Tab \"&#x9;\"><!ENTITY NewLine \"&", sbuf);
}

@Test
Expand Down
91 changes: 85 additions & 6 deletions src/io/sf/carte/doc/xml/dtd/DefaultEntityResolver.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.ext.EntityResolver2;
import org.xml.sax.ext.LexicalHandler;

import io.sf.carte.util.agent.AgentUtil;

Expand Down Expand Up @@ -279,8 +280,89 @@ public void addHostToWhiteList(String fqdn) {
}
}

/**
 * Supplies an external subset for documents that do not explicitly declare
 * one.
 * <p>
 * When a document has a {@code DOCTYPE} declaration that omits the external
 * subset, the subset returned here augments the declarations available for
 * validation, entity processing and attribute processing (normalization,
 * defaulting, and reporting types including {@code ID}). The augmentation is
 * reported through {@link LexicalHandler#startDTD startDTD()} as if the
 * document text had originally included the external subset, and that callback
 * happens before any internal subset data or errors are reported.
 * </p>
 * <p>
 * Documents with no {@code DOCTYPE} declaration at all are also covered: this
 * method is invoked when the root element is reached without a {@code DOCTYPE}
 * having been seen. If an external subset is returned, that root element is
 * declared to be the root element, which has the effect of splicing a
 * {@code DOCTYPE} declaration at the end of the prolog of a document that
 * could not otherwise be valid. In that case the parser callbacks logically
 * follow this sequence:
 * </p>
 *
 * <pre>
 * ... comments and PIs from the prolog (as usual)
 * startDTD ("rootName", source.getPublicId (), source.getSystemId ());
 * startEntity ("[dtd]");
 * ... declarations, comments, and PIs from the external subset
 * endEntity ("[dtd]");
 * endDTD ();
 * ... then the rest of the document (as usual)
 * startElement (..., "rootName", ...);
 * </pre>
 *
 * <p>
 * The returned {@code InputSource} gets no further resolution. A
 * (non-validating) processor that is not including external parameter entities
 * will never call this method.
 * </p>
 * <p>
 * Typical uses include facilitating data validation when interoperating with
 * XML processors that would always require undesirable network accesses for
 * external entities, or which for other reasons adopt a "no DTDs" policy.
 * </p>
 * <p>
 * <strong>Warning:</strong> returning an external subset modifies the input
 * document. Because it can provide definitions for general entities, it can
 * make a malformed document appear to be well formed.
 * </p>
 *
 * @param name    Identifies the document root element. This name comes from a
 *                {@code DOCTYPE} declaration (where available) or from the
 *                actual root element.
 * @param baseURI The document's base URI, serving as an additional hint for
 *                selecting the external subset. This is always an absolute
 *                URI, unless it is {@code null} because the {@code XMLReader}
 *                was given an {@code InputSource} without one.
 *
 * @return an {@code InputSource} object describing the new external subset to
 *         be used by the parser. If no specific subset could be determined, an
 *         input source describing the HTML5 entities is returned.
 *
 * @throws SAXException        if either the provided arguments or the input
 *                             source were invalid or not allowed.
 * @throws java.io.IOException if an I/O problem was found while loading the
 *                             input source.
 */
@Override
public InputSource getExternalSubset(String name, String baseURI) throws SAXException, IOException {
	InputSource subset = findExternalSubset(name, baseURI);
	if (subset != null) {
		// A specific subset was found for this root element name
		return subset;
	}
	// Nothing specific matched: fall back to the HTML5 entities
	String fname = systemIdToFilename.get("https://www.w3.org/TR/html5/entities.dtd");
	Reader re = dtdLoader.loadDTDfromClasspath(loader, fname);
	if (re == null) {
		throw new IOException("Could not find resource: " + fname);
	}
	return new InputSource(re);
}

private InputSource findExternalSubset(String name, String baseURI) throws SAXException, IOException {
InputSource is;
if ("html".equalsIgnoreCase(name)) {
is = resolveEntity("[dtd]", XHTML1_TRA_PUBLICID, baseURI, XHTML1_TRA_SYSTEMID);
Expand All @@ -291,7 +373,6 @@ public InputSource getExternalSubset(String name, String baseURI) throws SAXExce
is.setPublicId(null);
is.setSystemId(null);
} else {
// This method can return null safely: there is no SystemId URL to connect to.
is = null;
}
return is;
Expand Down Expand Up @@ -374,10 +455,7 @@ protected boolean registerSystemIdFilename(String systemId, String filename) {
* the XML specification to be the one associated with the
* "{@literal <}" starting the relevant declaration.
* @param systemId The system identifier of the external entity being
* referenced; either a relative or absolute URI. This is never
* {@code null} when invoked by a SAX2 parser; only declared
* entities, and any external subset, are resolved by such
* parsers.
* referenced; either a relative or absolute URI.
*
* @return an {@code InputSource} object describing the new input source to be
* used by the parser. This implementation never returns {@code null} if
Expand Down Expand Up @@ -461,7 +539,8 @@ public final InputSource resolveEntity(String name, String publicId, String base
InputStream is = con.getInputStream();
isrc.setCharacterStream(new InputStreamReader(is, charset));
} else {
isrc = getExternalSubset(name, baseURI);
isrc = findExternalSubset(name, baseURI);
// 'isrc' can be null safely: there is no SystemId URL to connect to
}
return isrc;
}
Expand Down