30
30
import org .xml .sax .InputSource ;
31
31
import org .xml .sax .SAXException ;
32
32
import org .xml .sax .ext .EntityResolver2 ;
33
+ import org .xml .sax .ext .LexicalHandler ;
33
34
34
35
import io .sf .carte .util .agent .AgentUtil ;
35
36
@@ -279,8 +280,89 @@ public void addHostToWhiteList(String fqdn) {
279
280
}
280
281
}
281
282
283
+ /**
284
+ * Allows applications to provide an external subset for documents that don't
285
+ * explicitly define one.
286
+ * <p>
287
+ * Documents with {@code DOCTYPE} declarations that omit an external subset can
288
+ * thus augment the declarations available for validation, entity processing,
289
+ * and attribute processing (normalization, defaulting, and reporting types
290
+ * including {@code ID}). This augmentation is reported through the
291
+ * {@link LexicalHandler#startDTD startDTD()} method as if the document text had
292
+ * originally included the external subset; this callback is made before any
293
+ * internal subset data or errors are reported.
294
+ * </p>
295
+ * <p>
296
+ * This method can also be used with documents that have no {@code DOCTYPE}
297
+ * declaration. When the root element is encountered but no {@code DOCTYPE}
298
+ * declaration has been seen, this method is invoked. If it returns a value for
299
+ * the external subset, that root element is declared to be the root element,
300
+ * giving the effect of splicing a {@code DOCTYPE} declaration at the end of the
301
+ * prolog of a document that could not otherwise be valid. The sequence of
302
+ * parser callbacks in that case logically resembles this:
303
+ * </p>
304
+ *
305
+ * <pre>
306
+ * ... comments and PIs from the prolog (as usual)
307
+ * startDTD ("rootName", source.getPublicId (), source.getSystemId ());
308
+ * startEntity ("[dtd]");
309
+ * ... declarations, comments, and PIs from the external subset
310
+ * endEntity ("[dtd]");
311
+ * endDTD ();
312
+ * ... then the rest of the document (as usual)
313
+ * startElement (..., "rootName", ...);
314
+ * </pre>
315
+ *
316
+ * <p>
317
+ * Note that the {@code InputSource} gets no further resolution. Also, this
318
+ * method will never be used by a (non-validating) processor that is not
319
+ * including external parameter entities.
320
+ * </p>
321
+ * <p>
322
+ * Uses for this method include facilitating data validation when interoperating
323
+ * with XML processors that would always require undesirable network accesses
324
+ * for external entities, or which for other reasons adopt a "no DTDs" policy.
325
+ * </p>
326
+ * <p>
327
+ * <strong>Warning:</strong> returning an external subset modifies the input
328
+ * document. By providing definitions for general entities, it can make a
329
+ * malformed document appear to be well formed.
330
+ * </p>
331
+ *
332
+ * @param name Identifies the document root element. This name comes from a
333
+ * {@code DOCTYPE} declaration (where available) or from the
334
+ * actual root element.
335
+ * @param baseURI The document's base URI, serving as an additional hint for
336
+ * selecting the external subset. This is always an absolute URI,
337
+ * unless it is {@code null} because the {@code XMLReader} was
338
+ * given an {@code InputSource} without one.
339
+ *
340
+ * @return an {@code InputSource} object describing the new external subset to
341
+ * be used by the parser. If no specific subset could be determined, an
342
+ * input source describing the HTML5 entities is returned.
343
+ *
344
+ * @throws SAXException if either the provided arguments or the input
345
+ * source were invalid or not allowed.
346
+ * @throws java.io.IOException if an I/O problem was found while loading the
347
+ * input source.
348
+ */
282
349
@ Override
283
350
public InputSource getExternalSubset (String name , String baseURI ) throws SAXException , IOException {
351
+ InputSource is = findExternalSubset (name , baseURI ); // look for a subset specific to this root element name
352
+ if (is == null ) {
353
+ // No specific subset was found: fall back to the HTML5 entities DTD, loaded from the classpath
354
+ String fname = systemIdToFilename .get ("https://www.w3.org/TR/html5/entities.dtd" );
355
+ Reader re = dtdLoader .loadDTDfromClasspath (loader , fname );
356
+ if (re != null ) {
357
+ is = new InputSource (re ); // character-stream source; no public or system ID is set on it here
358
+ } else {
359
+ throw new IOException ("Could not find resource: " + fname ); // the fallback DTD could not be loaded
360
+ }
361
+ }
362
+ return is ;
363
+ }
364
+
365
+ private InputSource findExternalSubset (String name , String baseURI ) throws SAXException , IOException {
284
366
InputSource is ;
285
367
if ("html" .equalsIgnoreCase (name )) {
286
368
is = resolveEntity ("[dtd]" , XHTML1_TRA_PUBLICID , baseURI , XHTML1_TRA_SYSTEMID );
@@ -291,7 +373,6 @@ public InputSource getExternalSubset(String name, String baseURI) throws SAXExce
291
373
is .setPublicId (null );
292
374
is .setSystemId (null );
293
375
} else {
294
- // This method can return null safely: there is no SystemId URL to connect to.
295
376
is = null ;
296
377
}
297
378
return is ;
@@ -374,10 +455,7 @@ protected boolean registerSystemIdFilename(String systemId, String filename) {
374
455
* the XML specification to be the one associated with the
375
456
* "{@literal <}" starting the relevant declaration.
376
457
* @param systemId The system identifier of the external entity being
377
- * referenced; either a relative or absolute URI. This is never
378
- * {@code null} when invoked by a SAX2 parser; only declared
379
- * entities, and any external subset, are resolved by such
380
- * parsers.
458
+ * referenced; either a relative or absolute URI.
381
459
*
382
460
* @return an {@code InputSource} object describing the new input source to be
383
461
* used by the parser. This implementation never returns {@code null} if
@@ -461,7 +539,8 @@ public final InputSource resolveEntity(String name, String publicId, String base
461
539
InputStream is = con .getInputStream ();
462
540
isrc .setCharacterStream (new InputStreamReader (is , charset ));
463
541
} else {
464
- isrc = getExternalSubset (name , baseURI );
542
+ isrc = findExternalSubset (name , baseURI );
543
+ // 'isrc' can be null safely: there is no SystemId URL to connect to
465
544
}
466
545
return isrc ;
467
546
}
0 commit comments