Skip to content

Commit

Permalink
Use the HTML parsing method from Symfony 5.4
Browse files Browse the repository at this point in the history
This should improve compatibility with PHP 8.3 because the code no longer calls
mb_convert_encoding with the deprecated 'HTML-ENTITIES' target encoding.
  • Loading branch information
wasinger committed Dec 7, 2023
1 parent 5147809 commit 49db1c7
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ jobs:

strategy:
matrix:
php: [7.4, 8.0, 8.1, 8.2]
php: [7.4, 8.0, 8.1, 8.2, 8.3]
dependency-version: [prefer-lowest, prefer-stable]

steps:
Expand Down
69 changes: 51 additions & 18 deletions src/Helpers.php
Original file line number Diff line number Diff line change
Expand Up @@ -82,28 +82,61 @@ public static function getBodyNodeFromHtmlFragment($html, $charset = 'UTF-8')

/**
 * Parses an HTML string into a \DOMDocument.
 *
 * NOTE(review): this listing is a diff rendered without +/- markers, so lines
 * of the pre-change and post-change method body appear interleaved here and
 * the closing brace of the method is not part of this span. Read the
 * statements below with that in mind; confirm the final body against the
 * repository.
 *
 * @param string $html    Markup to parse.
 * @param string $charset Charset name forwarded to parseXhtml(); defaults to 'UTF-8'.
 */
public static function loadHtml(string $html, $charset = 'UTF-8'): \DOMDocument
{
// True when the libxml version PHP was built against is older than 2.9.0.
$unsafeLibXml = \LIBXML_VERSION < 20900;
// Switch libxml to internal error collection; $current keeps the previous setting.
$current = libxml_use_internal_errors(true);
if($unsafeLibXml) {
// Delegate the actual parsing to parseXhtml().
return self::parseXhtml($html, $charset);
}
/**
 * Parses an HTML string into a \DOMDocument after passing it through
 * convertToHtmlEntities().
 *
 * Function originally taken from Symfony\Component\DomCrawler\Crawler
 * (c) Fabien Potencier <fabien@symfony.com>
 * License: MIT
 *
 * NOTE(review): this listing is a diff rendered without +/- markers. Statements
 * that use $d, $html, $current and $unsafeLibXml appear to be leftovers of the
 * removed implementation, interleaved with the new statements that use $dom,
 * $htmlContent, $internalErrors and $disableEntities — confirm against the
 * repository before relying on the exact statement order shown here.
 *
 * @param string $htmlContent Markup to parse.
 * @param string $charset     Charset of $htmlContent; also passed to the \DOMDocument constructor.
 */
private static function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
// Encode characters from 0x80 upwards as numeric entities (see convertToHtmlEntities()).
$htmlContent = self::convertToHtmlEntities($htmlContent, $charset);

// Collect libxml errors internally; the previous setting is restored after parsing.
$internalErrors = libxml_use_internal_errors(true);
// On libxml older than 2.9.0, disable the external entity loader and remember its previous state.
if (\LIBXML_VERSION < 20900) {
$disableEntities = libxml_disable_entity_loader(true);
}
// NOTE(review): the lines below working on $d / $html look like the removed pre-change code.
$d = new \DOMDocument('1.0', $charset);
$d->validateOnParse = true;
if (function_exists('mb_convert_encoding') && in_array(
strtolower($charset),
array_map('strtolower', mb_list_encodings())
)
) {
$html = mb_convert_encoding($html, 'HTML-ENTITIES', $charset);

$dom = new \DOMDocument('1.0', $charset);
$dom->validateOnParse = true;

// Only invoke the parser when there is something other than whitespace to parse.
if ('' !== trim($htmlContent)) {
// PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
// Option LIBXML_SCHEMA_CREATE seems to prevent this
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
// The @ operator suppresses diagnostics raised by loadHTML() itself.
@$dom->loadHTML($htmlContent, \LIBXML_SCHEMA_CREATE);
}
// PHP DOMDocument->loadHTML method tends to "eat" closing tags in html strings within script elements
// Option LIBXML_SCHEMA_CREATE seems to prevent this
// see https://stackoverflow.com/questions/24575136/domdocument-removes-html-tags-in-javascript-string
@$d->loadHTML($html, \LIBXML_SCHEMA_CREATE);
libxml_use_internal_errors($current);
if($unsafeLibXml) {

// Put libxml error handling back to the state saved in $internalErrors.
libxml_use_internal_errors($internalErrors);
// Restore the entity loader state saved above (same libxml version guard).
if (\LIBXML_VERSION < 20900) {
libxml_disable_entity_loader($disableEntities);
}
return $d;

return $dom;
}

/**
 * Rewrites every character from U+0080 upwards as a numeric HTML entity so
 * that the resulting markup consists of ASCII characters only.
 *
 * The string is first encoded directly from $charset. If that raises a PHP
 * error, an \Exception or a \ValueError, the content is converted to UTF-8
 * with iconv() and encoded again; when this fallback fails as well, the
 * content is returned in whatever state the fallback left it.
 *
 * Function taken from Symfony\Component\DomCrawler\Crawler
 * (c) Fabien Potencier <fabien@symfony.com>
 * License: MIT
 */
private static function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    // Conversion map for mb_encode_numericentity(): [first code point, last code point, offset, mask].
    $entityMap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

    // Promote PHP warnings/notices raised below to exceptions so they can be caught.
    set_error_handler(function () {
        throw new \Exception();
    });

    try {
        $encoded = mb_encode_numericentity($htmlContent, $entityMap, $charset);
    } catch (\Exception|\ValueError $directFailure) {
        try {
            // Reassign step by step: if only the second call fails, the UTF-8 converted text is kept.
            $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
            $htmlContent = mb_encode_numericentity($htmlContent, $entityMap, 'UTF-8');
        } catch (\Exception|\ValueError $fallbackFailure) {
            // Best effort: fall through with the content as it currently is.
        }

        $encoded = $htmlContent;
    } finally {
        // Always remove the temporary error handler, whatever happened above.
        restore_error_handler();
    }

    return $encoded;
}
}

0 comments on commit 49db1c7

Please sign in to comment.