Skip to content

Commit

Permalink
[FEATURE] AbstractHtmlProcessor: HTML5 void tags
Browse files Browse the repository at this point in the history
Added support to `AbstractHtmlProcessor` for HTML5 self-closing tags not
recognized as such by PHP’s `DOMDocument` implementation.  In effect this is a
workaround for the issue reported in https://bugs.php.net/bug.php?id=73175.

Affected tags require a self-closing slash in the HTML input to `DOMDocument`
(e.g. `<wbr/>` rather than `<wbr>`), and their invalid corresponding closing tags
(e.g. `</wbr>`) need to be removed from its HTML output.

Follows from discussion in #650.
  • Loading branch information
JakeQZ committed Feb 13, 2019
1 parent b83cea6 commit d7cb5ef
Show file tree
Hide file tree
Showing 2 changed files with 199 additions and 13 deletions.
50 changes: 46 additions & 4 deletions src/Emogrifier/HtmlProcessor/AbstractHtmlProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ abstract class AbstractHtmlProcessor
*/
const CONTENT_TYPE_META_TAG = '<meta http-equiv="Content-Type" content="text/html; charset=utf-8">';

/**
* @var string Regular expression part to match tag names that PHP's DOMDocument implementation is not aware are
* self-closing. These are mostly HTML5 elements, but for completeness <command> (obsolete) and <keygen>
* (deprecated) from the list at https://bugs.php.net/bug.php?id=73175 are included.
*
* The value is a non-capturing group of alternatives without delimiters, anchors or modifiers, so it is meant to
* be embedded in a larger pattern rather than used as a complete pattern on its own.
*/
const PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER = '(?:command|embed|keygen|source|track|wbr)';

/**
* @var \DOMDocument
*/
Expand Down Expand Up @@ -74,7 +81,7 @@ public function getDomDocument()
*/
public function render()
{
return $this->domDocument->saveHTML();
return $this->renderDomDocument();
}

/**
Expand All @@ -84,11 +91,27 @@ public function render()
*/
public function renderBodyContent()
{
$bodyNodeHtml = $this->domDocument->saveHTML($this->getBodyElement());
$bodyNodeHtml = $this->renderDomDocument($this->getBodyElement());

return \str_replace(['<body>', '</body>'], '', $bodyNodeHtml);
}

/**
 * Serializes the DOMDocument (or only the given node) to HTML and strips the closing tags that are output for
 * the void elements listed in PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER, as such closing tags are invalid HTML.
 *
 * @param \DOMNode|null $node node to render; if omitted, the whole document is rendered
 *
 * @return string
 */
protected function renderDomDocument(\DOMNode $node = null)
{
    $invalidClosingTagPattern = '%</' . static::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '>%';
    $htmlWithInvalidClosingTags = $this->domDocument->saveHTML($node);

    return \preg_replace($invalidClosingTagPattern, '', $htmlWithInvalidClosingTags);
}

/**
* Returns the BODY element.
*
Expand Down Expand Up @@ -137,7 +160,7 @@ private function createRawDomDocument($html)
}

/**
* Returns the HTML with added document type and Content-Type meta tag if needed,
* Returns the HTML with added document type, Content-Type meta tag, and self-closing slashes, if needed,
* ensuring that the HTML will be good for creating a DOM document from it.
*
* @param string $html
Expand All @@ -146,7 +169,9 @@ private function createRawDomDocument($html)
*/
private function prepareHtmlForDomConversion($html)
{
$htmlWithDocumentType = $this->ensureDocumentType($html);
$htmlWithSelfClosingSlashes = $this->ensurePhpUnrecognizedSelfClosingTagsAreXml($html);

$htmlWithDocumentType = $this->ensureDocumentType($htmlWithSelfClosingSlashes);

return $this->addContentTypeMetaTag($htmlWithDocumentType);
}
Expand Down Expand Up @@ -204,6 +229,23 @@ private function addContentTypeMetaTag($html)
return $reworkedHtml;
}

/**
 * Makes sure that any self-closing tags not recognized as such by PHP's DOMDocument implementation have a
 * self-closing slash.
 *
 * Tag names are matched case-insensitively (HTML tag names are not case-sensitive, so e.g. `<WBR>` must be
 * converted as well), and a tag name only counts as matched if it is directly followed by whitespace, `/` or `>`.
 * The latter prevents elements whose name merely starts with one of the void tag names followed by a non-word
 * character (e.g. `<source-list>` or `<embed:foo>`) from being turned into self-closing tags.
 *
 * Tags that already end with `/>` are left untouched.
 *
 * Known limitation: the end of the tag is taken to be the first `>` after the tag name, so a literal `>` within
 * an attribute value of an affected tag is not supported.
 *
 * @param string $html
 *
 * @return string HTML with problematic tags converted.
 */
private function ensurePhpUnrecognizedSelfClosingTagsAreXml($html)
{
    return \preg_replace(
        // - `(?=[\s/>])` requires the tag name to end here;
        // - `[^>]*+` possessively consumes everything up to (but excluding) the closing `>`;
        // - `(?<!/)` skips tags that already have the self-closing slash;
        // - `(?=>)` makes sure the tag is actually closed, without consuming the `>`.
        '%<' . static::PHP_UNRECOGNIZED_VOID_TAGNAME_MATCHER . '(?=[\\s/>])[^>]*+(?<!/)(?=>)%i',
        '$0/',
        $html
    );
}

/**
* Checks that $this->domDocument has a BODY element and adds it if it is missing.
*
Expand Down
162 changes: 153 additions & 9 deletions tests/Unit/Emogrifier/HtmlProcessor/AbstractHtmlProcessorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -302,36 +302,138 @@ public function notAddsSecondContentTypeMetaTag()
static::assertSame(1, $numberOfContentTypeMetaTags);
}

/**
 * Provides HTML snippets in which all void elements are written with an XML-style self-closing slash.
 *
 * @return string[][] Each dataset has two elements in the following order:
 *         - HTML with XML self-closing tags (e.g. "...<br/>...");
 *         - The name of a self-closing tag contained in the HTML (e.g. "br").
 */
public function xmlSelfClosingTagDataProvider()
{
    $datasets = [];

    $datasets['<br>'] = ['<br/>', 'br'];
    $datasets['<wbr>'] = ['foo<wbr/>bar', 'wbr'];
    $datasets['<embed>'] = [
        '<embed type="video/mp4" src="https://example.com/flower.mp4" width="250" height="200"/>',
        'embed',
    ];
    $datasets['<picture> with <source> and <img>'] = [
        '<picture><source srcset="https://example.com/flower-800x600.jpeg" media="(min-width: 600px)"/>'
        . '<img src="https://example.com/flower-400x300.jpeg"/></picture>',
        'source',
    ];
    $datasets['<video> with <track>'] = [
        '<video controls width="250" src="https://example.com/flower.mp4">'
        . '<track default kind="captions" srclang="en" src="https://example.com/flower.vtt"/></video>',
        'track',
    ];

    return $datasets;
}

/**
 * Provides the datasets of xmlSelfClosingTagDataProvider() with every "/>" in the HTML replaced by ">".
 *
 * @return string[][] Each dataset has two elements in the following order:
 *         - HTML with non-XML self-closing tags (e.g. "...<br>...");
 *         - The name of a self-closing tag contained in the HTML (e.g. "br").
 */
public function nonXmlSelfClosingTagDataProvider()
{
    $datasets = $this->xmlSelfClosingTagDataProvider();
    foreach ($datasets as $datasetName => $xmlDataset) {
        $datasets[$datasetName][0] = \str_replace('/>', '>', $xmlDataset[0]);
    }

    return $datasets;
}

/**
 * Provides the datasets of xmlSelfClosingTagDataProvider(), each with the non-XML variant of the HTML prepended.
 *
 * @return string[][] Each dataset has three elements in the following order:
 *         - HTML with non-XML self-closing tags (e.g. "...<br>...");
 *         - The equivalent HTML with XML self-closing tags (e.g. "...<br/>...");
 *         - The name of a self-closing tag contained in the HTML (e.g. "br").
 */
public function selfClosingTagDataProvider()
{
    $datasets = [];
    foreach ($this->xmlSelfClosingTagDataProvider() as $datasetName => $xmlDataset) {
        $htmlWithNonXmlSelfClosingTags = \str_replace('/>', '>', $xmlDataset[0]);
        $datasets[$datasetName] = \array_merge([$htmlWithNonXmlSelfClosingTags], $xmlDataset);
    }

    return $datasets;
}

/**
 * Provides every combination of a dataset from documentTypeDataProvider() with a dataset from
 * selfClosingTagDataProvider().
 *
 * @return string[][] Each dataset consists of the elements of the document type dataset followed by the elements
 *         of the self-closing tag dataset; its name is both dataset names joined with " & ".
 */
public function documentTypeAndSelfClosingTagDataProvider()
{
    $documentTypeDatasets = $this->documentTypeDataProvider();
    $selfClosingTagDatasets = $this->selfClosingTagDataProvider();

    $combinedDatasets = [];
    foreach ($documentTypeDatasets as $documentTypeName => $documentTypeArguments) {
        foreach ($selfClosingTagDatasets as $selfClosingTagName => $selfClosingTagArguments) {
            $combinedName = $documentTypeName . ' & ' . $selfClosingTagName;
            $combinedDatasets[$combinedName] = \array_merge($documentTypeArguments, $selfClosingTagArguments);
        }
    }

    return $combinedDatasets;
}

/**
* @test
*
* @param string $documentType
* @param string $htmlWithNonXmlSelfClosingTags
* @param string $htmlWithXmlSelfClosingTags
*
* @dataProvider documentTypeDataProvider
* @dataProvider documentTypeAndSelfClosingTagDataProvider
*/
public function convertsXmlSelfClosingTagsToNonXmlSelfClosingTag($documentType)
{
$subject = new TestingHtmlProcessor($documentType . '<html><body><br/></body></html>');
public function convertsXmlSelfClosingTagsToNonXmlSelfClosingTag(
$documentType,
$htmlWithNonXmlSelfClosingTags,
$htmlWithXmlSelfClosingTags
) {
$subject = new TestingHtmlProcessor(
$documentType . '<html><body>' . $htmlWithXmlSelfClosingTags . '</body></html>'
);

$result = $subject->render();

static::assertContains('<body><br></body>', $result);
static::assertContains('<body>' . $htmlWithNonXmlSelfClosingTags . '</body>', $result);
}

/**
* @test
*
* @param string $documentType
* @param string $htmlWithNonXmlSelfClosingTags
*
* @dataProvider documentTypeDataProvider
* @dataProvider documentTypeAndSelfClosingTagDataProvider
*/
public function keepsNonXmlSelfClosingTags($documentType, $htmlWithNonXmlSelfClosingTags)
{
    // The body is expected to come out of rendering exactly as it went in.
    $bodyHtml = '<body>' . $htmlWithNonXmlSelfClosingTags . '</body>';
    $subject = new TestingHtmlProcessor($documentType . '<html>' . $bodyHtml . '</html>');

    $renderedHtml = $subject->render();

    static::assertContains($bodyHtml, $renderedHtml);
}

/**
* @test
*
* @param string $htmlWithNonXmlSelfClosingTags
* @param string $tagName
*
* @dataProvider nonXmlSelfClosingTagDataProvider
*/
public function keepsNonXmlSelfClosingTags($documentType)
public function notAddsClosingTagForSelfClosingTags($htmlWithNonXmlSelfClosingTags, $tagName)
{
$subject = new TestingHtmlProcessor($documentType . '<html><body><br></body></html>');
$subject = new TestingHtmlProcessor(
'<html><body>' . $htmlWithNonXmlSelfClosingTags . '</body></html>'
);

$result = $subject->render();

static::assertContains('<body><br></body>', $result);
static::assertNotContains('</' . $tagName, $result);
}

/**
Expand Down Expand Up @@ -359,6 +461,25 @@ public function renderBodyContentReturnsBodyContent()
static::assertSame($bodyContent, $result);
}

/**
 * Checks that the rendered body content contains no closing tag for the given void element.
 *
 * @test
 *
 * @param string $htmlWithNonXmlSelfClosingTags
 * @param string $tagName
 *
 * @dataProvider nonXmlSelfClosingTagDataProvider
 */
public function renderBodyContentNotAddsClosingTagForSelfClosingTags($htmlWithNonXmlSelfClosingTags, $tagName)
{
    $html = '<html><body>' . $htmlWithNonXmlSelfClosingTags . '</body></html>';
    $subject = new TestingHtmlProcessor($html);

    $renderedBodyContent = $subject->renderBodyContent();

    $closingTagStart = '</' . $tagName;
    static::assertNotContains($closingTagStart, $renderedBodyContent);
}

/**
* @test
*/
Expand All @@ -383,4 +504,27 @@ public function getDomDocumentWithNormalizedHtmlRepresentsTheGivenHtml()

self::assertSame($html, $domDocument->saveHTML());
}

/**
 * Checks that void elements in the parsed DOM document do not swallow the content following them.
 *
 * @test
 *
 * @param string $htmlWithNonXmlSelfClosingTags
 * @param string $tagName
 *
 * @dataProvider nonXmlSelfClosingTagDataProvider
 */
public function getDomDocumentVoidElementNotHasChildNodes($htmlWithNonXmlSelfClosingTags, $tagName)
{
    $subject = new TestingHtmlProcessor(
        // Append a 'trap' element that might become a child node if the HTML is parsed incorrectly
        '<html><body>' . $htmlWithNonXmlSelfClosingTags . '<span>foo</span></body></html>'
    );

    $domDocument = $subject->getDomDocument();

    $voidElements = $domDocument->getElementsByTagName($tagName);
    // Guard against a vacuous pass: without any matching element, the loop below would not assert anything.
    static::assertGreaterThan(0, $voidElements->length);
    foreach ($voidElements as $element) {
        static::assertFalse($element->hasChildNodes());
    }
}
}

0 comments on commit d7cb5ef

Please sign in to comment.