Skip to content

Commit cd65fdc

Browse files
committed
Use the HTML5 entities as fallback if no subset is found in getExternalSubset
See #3.
1 parent 35e5568 commit cd65fdc

File tree

4 files changed

+137
-13
lines changed

4 files changed

+137
-13
lines changed

junit/io/sf/carte/doc/xml/dtd/DefaultEntityResolverTest.java

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,6 @@ public void getExternalSubsetStringString() throws SAXException, IOException {
4545
Reader re = isrc.getCharacterStream();
4646
assertNotNull(re);
4747
re.close();
48-
//
49-
isrc = resolver.getExternalSubset("foo", null);
50-
assertNull(isrc);
5148
// SVG
5249
isrc = resolver.getExternalSubset("svg", null);
5350
assertNotNull(isrc);
@@ -58,6 +55,24 @@ public void getExternalSubsetStringString() throws SAXException, IOException {
5855
re.close();
5956
}
6057

58+
@Test
59+
public void getExternalSubsetStringStringUnknownSubset() throws SAXException, IOException {
60+
InputSource isrc = resolver.getExternalSubset("foo", null);
61+
assertNotNull(isrc);
62+
assertNull(isrc.getPublicId());
63+
assertNull(isrc.getSystemId());
64+
Reader re = isrc.getCharacterStream();
65+
assertNotNull(re);
66+
char[] cbuf = new char[40];
67+
try {
68+
re.read(cbuf);
69+
} finally {
70+
re.close();
71+
}
72+
String sbuf = new String(cbuf);
73+
assertEquals("<!ENTITY Tab \"&#x9;\"><!ENTITY NewLine \"&", sbuf);
74+
}
75+
6176
@Test
6277
public void resolveEntityStringString() throws SAXException, IOException {
6378
InputSource isrc = resolver.resolveEntity(DocumentTypeDeclaration.XHTML1_TRA_PUBLICID,

junit/io/sf/carte/doc/xml/dtd/StackedEntityResolverTest.java

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,24 @@ public void getExternalSubsetStringString() throws SAXException, IOException {
4545
Reader re = isrc.getCharacterStream();
4646
assertNotNull(re);
4747
re.close();
48-
isrc = stackedResolver.getExternalSubset("foo", null);
49-
assertNull(isrc);
48+
}
49+
50+
@Test
51+
public void getExternalSubsetStringStringUnknownSubset() throws SAXException, IOException {
52+
InputSource isrc = stackedResolver.getExternalSubset("foo", null);
53+
assertNotNull(isrc);
54+
assertNull(isrc.getPublicId());
55+
assertNull(isrc.getSystemId());
56+
Reader re = isrc.getCharacterStream();
57+
assertNotNull(re);
58+
char[] cbuf = new char[40];
59+
try {
60+
re.read(cbuf);
61+
} finally {
62+
re.close();
63+
}
64+
String sbuf = new String(cbuf);
65+
assertEquals("<!ENTITY Tab \"&#x9;\"><!ENTITY NewLine \"&", sbuf);
5066
}
5167

5268
@Test

junit/io/sf/carte/doc/xml/dtd/StackedEntityResolverTest2.java

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,24 @@ public void getExternalSubsetStringString() throws SAXException, IOException {
4545
Reader re = isrc.getCharacterStream();
4646
assertNotNull(re);
4747
re.close();
48-
isrc = stackedResolver.getExternalSubset("foo", null);
49-
assertNull(isrc);
48+
}
49+
50+
@Test
51+
public void getExternalSubsetStringStringUnknownSubset() throws SAXException, IOException {
52+
InputSource isrc = stackedResolver.getExternalSubset("foo", null);
53+
assertNotNull(isrc);
54+
assertNull(isrc.getPublicId());
55+
assertNull(isrc.getSystemId());
56+
Reader re = isrc.getCharacterStream();
57+
assertNotNull(re);
58+
char[] cbuf = new char[40];
59+
try {
60+
re.read(cbuf);
61+
} finally {
62+
re.close();
63+
}
64+
String sbuf = new String(cbuf);
65+
assertEquals("<!ENTITY Tab \"&#x9;\"><!ENTITY NewLine \"&", sbuf);
5066
}
5167

5268
@Test

src/io/sf/carte/doc/xml/dtd/DefaultEntityResolver.java

Lines changed: 83 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import org.xml.sax.InputSource;
3131
import org.xml.sax.SAXException;
3232
import org.xml.sax.ext.EntityResolver2;
33+
import org.xml.sax.ext.LexicalHandler;
3334

3435
import io.sf.carte.util.agent.AgentUtil;
3536

@@ -279,8 +280,88 @@ public void addHostToWhiteList(String fqdn) {
279280
}
280281
}
281282

283+
/**
284+
* Allows applications to provide an external subset for documents that don't
285+
* explicitly define one.
286+
* <p>
287+
* Documents with {@code DOCTYPE} declarations that omit an external subset can
288+
* thus augment the declarations available for validation, entity processing,
289+
* and attribute processing (normalization, defaulting, and reporting types
290+
* including {@code ID}). This augmentation is reported through the
291+
* {@link LexicalHandler#startDTD startDTD()} method as if the document text had
292+
* originally included the external subset; this callback is made before any
293+
* internal subset data or errors are reported.
294+
* </p>
295+
* <p>
296+
* This method can also be used with documents that have no {@code DOCTYPE}
297+
* declaration. When the root element is encountered but no {@code DOCTYPE}
298+
* declaration has been seen, this method is invoked. If it returns a value for
299+
* the external subset, that root element is declared to be the root element,
300+
* giving the effect of splicing a {@code DOCTYPE} declaration at the end of the
301+
* prolog of a document that could not otherwise be valid. The sequence of
302+
* parser callbacks in that case logically resembles this:
303+
* </p>
304+
*
305+
* <pre>
306+
* ... comments and PIs from the prolog (as usual)
307+
* startDTD ("rootName", source.getPublicId (), source.getSystemId ());
308+
* startEntity ("[dtd]");
309+
* ... declarations, comments, and PIs from the external subset
310+
* endEntity ("[dtd]");
311+
* endDTD ();
312+
* ... then the rest of the document (as usual)
313+
* startElement (..., "rootName", ...);
314+
* </pre>
315+
*
316+
* <p>
317+
* Note that the {@code InputSource} gets no further resolution. Also, this
318+
* method will never be used by a (non-validating) processor that is not
319+
* including external parameter entities.
320+
* </p>
321+
* <p>
322+
* Uses for this method include facilitating data validation when interoperating
323+
* with XML processors that would always require undesirable network accesses
324+
* for external entities, or which for other reasons adopt a "no DTDs" policy.
325+
* </p>
326+
* <p>
327+
* <strong>Warning:</strong> returning an external subset modifies the input
328+
* document. By providing definitions for general entities, it can make a
329+
* malformed document appear to be well formed.
330+
* </p>
331+
*
332+
* @param name Identifies the document root element. This name comes from a
333+
* {@code DOCTYPE} declaration (where available) or from the
334+
* actual root element.
335+
* @param baseURI The document's base URI, serving as an additional hint for
336+
* selecting the external subset. This is always an absolute URI,
337+
* unless it is {@code null} because the {@code XMLReader} was
338+
* given an {@code InputSource} without one.
339+
*
340+
* @return an {@code InputSource} object describing the new external subset to
341+
* be used by the parser. If no specific subset could be determined, an
342+
* input source describing the HTML5 entities is returned.
343+
*
344+
* @throws SAXException if either the provided arguments or the input
345+
* source were invalid or not allowed.
346+
* @throws java.io.IOException if an I/O problem was found while loading the
347+
* input source.
348+
*/
282349
@Override
283350
public InputSource getExternalSubset(String name, String baseURI) throws SAXException, IOException {
351+
InputSource is = findExternalSubset(name, baseURI);
352+
if (is == null) {
353+
String fname = systemIdToFilename.get("https://www.w3.org/TR/html5/entities.dtd");
354+
Reader re = dtdLoader.loadDTDfromClasspath(loader, fname);
355+
if (re != null) {
356+
is = new InputSource(re);
357+
} else {
358+
throw new IOException("Could not find resource: " + fname);
359+
}
360+
}
361+
return is;
362+
}
363+
364+
private InputSource findExternalSubset(String name, String baseURI) throws SAXException, IOException {
284365
InputSource is;
285366
if ("html".equalsIgnoreCase(name)) {
286367
is = resolveEntity("[dtd]", XHTML1_TRA_PUBLICID, baseURI, XHTML1_TRA_SYSTEMID);
@@ -291,7 +372,6 @@ public InputSource getExternalSubset(String name, String baseURI) throws SAXExce
291372
is.setPublicId(null);
292373
is.setSystemId(null);
293374
} else {
294-
// This method can return null safely: there is no SystemId URL to connect to.
295375
is = null;
296376
}
297377
return is;
@@ -374,10 +454,7 @@ protected boolean registerSystemIdFilename(String systemId, String filename) {
374454
* the XML specification to be the one associated with the
375455
* "{@literal <}" starting the relevant declaration.
376456
* @param systemId The system identifier of the external entity being
377-
* referenced; either a relative or absolute URI. This is never
378-
* {@code null} when invoked by a SAX2 parser; only declared
379-
* entities, and any external subset, are resolved by such
380-
* parsers.
457+
* referenced; either a relative or absolute URI.
381458
*
382459
* @return an {@code InputSource} object describing the new input source to be
383460
* used by the parser. This implementation never returns {@code null} if
@@ -461,7 +538,7 @@ public final InputSource resolveEntity(String name, String publicId, String base
461538
InputStream is = con.getInputStream();
462539
isrc.setCharacterStream(new InputStreamReader(is, charset));
463540
} else {
464-
isrc = getExternalSubset(name, baseURI);
541+
isrc = findExternalSubset(name, baseURI);
465542
}
466543
return isrc;
467544
}

0 commit comments

Comments
 (0)