|
73 | 73 | // -----------------------------------------------------------------------------
|
74 | 74 | // get html dom from file
|
75 | 75 | // $maxLen defaults to -1, which is the value of PHP_STREAM_COPY_ALL.
|
76 |
| -function file_get_html($url, $use_include_path = false, $context=null, $offset=0, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) |
| 76 | +function file_get_html($url, $use_include_path = false, $context=null, $offset = -1, $maxLen=-1, $lowercase = true, $forceTagsClosed=true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT) |
77 | 77 | {
|
78 | 78 | // We DO force the tags to be terminated.
|
79 | 79 | $dom = new simple_html_dom(null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText);
|
@@ -693,7 +693,7 @@ protected function parse_selector($selector_string) {
|
693 | 693 | // This implies that an html attribute specifier may start with an @ sign that is NOT captured by the expression.
|
694 | 694 | // further study is required to determine if this should be documented or removed.
|
695 | 695 | // $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is";
|
696 |
| - $pattern = "/([\w-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[@?(!?[\w-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; |
| 696 | + $pattern = "/([\w\-:\*]*)(?:\#([\w\-]+)|\.([\w\-]+))?(?:\[@?(!?[\w\-:]+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([\/, ]+)/is"; |
697 | 697 | preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
|
698 | 698 | if (is_object($debugObject)) {$debugObject->debugLog(2, "Matches Array: ", $matches);}
|
699 | 699 |
|
@@ -821,11 +821,11 @@ function convert_text($text)
|
821 | 821 | }
|
822 | 822 |
|
823 | 823 | /**
|
824 |
| - * Returns true if $string is valid UTF-8 and false otherwise. |
825 |
| - * |
826 |
| - * @param mixed $str String to be tested |
827 |
| - * @return boolean |
828 |
| - */ |
| 824 | + * Returns true if $string is valid UTF-8 and false otherwise. |
| 825 | + * |
| 826 | + * @param mixed $str String to be tested |
| 827 | + * @return boolean |
| 828 | + */ |
829 | 829 | static function is_utf8($str)
|
830 | 830 | {
|
831 | 831 | $c=0; $b=0;
|
@@ -899,9 +899,9 @@ function get_display_size()
|
899 | 899 | {
|
900 | 900 | // Thanks to user gnarf from stackoverflow for this regular expression.
|
901 | 901 | $attributes = array();
|
902 |
| - preg_match_all("/([\w-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER); |
| 902 | + preg_match_all("/([\w\-]+)\s*:\s*([^;]+)\s*;?/", $this->attr['style'], $matches, PREG_SET_ORDER); |
903 | 903 | foreach ($matches as $match) {
|
904 |
| - $attributes[$match[1]] = $match[2]; |
| 904 | + $attributes[$match[1]] = $match[2]; |
905 | 905 | }
|
906 | 906 |
|
907 | 907 | // If there is a width in the style attributes:
|
@@ -947,13 +947,13 @@ function get_display_size()
|
947 | 947 | // If the class or id is specified in a SEPARATE css file that's not on the page, go get it and do what we were just doing for the ones on the page.
|
948 | 948 |
|
949 | 949 | $result = array('height' => $height,
|
950 |
| - 'width' => $width); |
| 950 | + 'width' => $width); |
951 | 951 | return $result;
|
952 | 952 | }
|
953 | 953 |
|
954 | 954 | // camel naming conventions
|
955 |
| - function getAllAttributes() {return array_map('html_entity_decode', $this->attr);} |
956 |
| - function getAttribute($name) {return html_entity_decode($this->__get($name));} |
| 955 | + function getAllAttributes() {return $this->attr;} |
| 956 | + function getAttribute($name) {return $this->__get($name);} |
957 | 957 | function setAttribute($name, $value) {$this->__set($name, $value);}
|
958 | 958 | function hasAttribute($name) {return $this->__isset($name);}
|
959 | 959 | function removeAttribute($name) {$this->__set($name, null);}
|
@@ -1023,7 +1023,7 @@ class simple_html_dom
|
1023 | 1023 | 'p'=>array('p'=>1),
|
1024 | 1024 | 'nobr'=>array('nobr'=>1),
|
1025 | 1025 | 'b'=>array('b'=>1),
|
1026 |
| - 'option'=>array('option'=>1), |
| 1026 | + 'option'=>array('option'=>1), |
1027 | 1027 | );
|
1028 | 1028 |
|
1029 | 1029 | function __construct($str=null, $lowercase=true, $forceTagsClosed=true, $target_charset=DEFAULT_TARGET_CHARSET, $stripRN=true, $defaultBRText=DEFAULT_BR_TEXT, $defaultSpanText=DEFAULT_SPAN_TEXT)
|
@@ -1245,7 +1245,7 @@ protected function parse_charset()
|
1245 | 1245 | if (empty($charset))
|
1246 | 1246 | {
|
1247 | 1247 | // Have php try to detect the encoding from the text given to us.
|
1248 |
| - $charset = (function_exists('mb_detect_encoding')) ? mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ) : false; |
| 1248 | + $charset = mb_detect_encoding($this->root->plaintext . "ascii", $encoding_list = array( "UTF-8", "CP1252" ) ); |
1249 | 1249 | if (is_object($debugObject)) {$debugObject->debugLog(2, 'mb_detect found: ' . $charset);}
|
1250 | 1250 |
|
1251 | 1251 | // and if this doesn't work... then we need to just wrongheadedly assume it's UTF-8 so that we can move on - because this will usually give us most of what we need...
|
@@ -1375,7 +1375,7 @@ protected function read_tag()
|
1375 | 1375 | return true;
|
1376 | 1376 | }
|
1377 | 1377 |
|
1378 |
| - if (!preg_match("/^[\w-:]+$/", $tag)) { |
| 1378 | + if (!preg_match("/^[\w\-:]+$/", $tag)) { |
1379 | 1379 | $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until('<>');
|
1380 | 1380 | if ($this->char==='<') {
|
1381 | 1381 | $this->link_nodes($node, false);
|
|
0 commit comments