]*?>(.*?)<\/xmp\b[^>]*?>/si',
+ create_function('$matches', '
+ return "".str_replace("<", "<", $matches[1])."
";
+ '),
+ $responseBody
+ );
+ $responseBody = preg_replace_callback(
+ '/]*?>(.*)$/si',
+ create_function('$matches', '
+ return "".str_replace("<", "<", $matches[1])."
";
+ '),
+ $responseBody
+ );
+ /*
+ * Remove DTD declarations, wrongly placed comments etc.
+ * This must be done before removing DOCTYPE.
+ */
+ $responseBody = preg_replace('/]*?>/si', '', $responseBody);
+ /*
+ * XML and DOCTYPE declaration will be replaced.
+ */
+ $responseBody = preg_replace('/]*?>/si', '', $responseBody);
+ $responseBody = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $responseBody);
+ if (preg_match('/^\s*$/s', $responseBody)) {
+ require_once 'Diggin/Scraper/Adapter/Exception.php';
+ throw new Diggin_Scraper_Adapter_Exception('The entity body became empty after preprocessing.');
+ }
+
+ /*
+ * Restore CDATAs and comments.
+ */
+ for ($i = 0; $i < $this->backup_count; $i++) {
+ $responseBody = str_replace("", $this->backup[$i], $responseBody);
+ }
+
+ /*
+ * Use Tidy to format HTML if available.
+ * Otherwise, use HTMLParser class (is slower and consumes much memory).
+ */
+
+ /*
+ * Replace every '&' with '&amp;'
+ * so that the XML parser does not break on non-predefined entities.
+ * As a result, you may need to replace '&amp;' with '&'
+ * to get the original HTML string back from the returned SimpleXML object.
+ *
+ * //@see
+ * Tidy, in turn, converts htmlspecialchars() output ('&gt;', '&lt;') back to ('>', '<')
+ * when the text does not form an HTML tag for tidy,
+ * so the "str_replace('&')" step must run before tidy.
+ */
+
+ if (extension_loaded('tidy')) {
+ if ($this->config['pre_ampersand_escape']) {
+ $responseBody = str_replace('&', '&', $responseBody);
+ }
+ $tidy = new tidy();
+ $tidy->parseString($responseBody, $this->config['tidy'], 'UTF8');
+ $tidy->cleanRepair();
+ $responseBody = $tidy->html();
+ } else {
+ if ($this->config['pre_ampersand_escape']) {
+ $responseBody = str_replace('&', '&', $responseBody);
+ }
+ //?
+ $responseBody = str_replace('&', '&', $responseBody);
+ require_once 'HTMLParser.class.php';
+ $parser = new HTMLParser;
+ $format_rule = require 'xhtml1-transitional_dtd.inc.php';
+ $parser->setRule($format_rule);
+ $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
+ $parser->setGenericParent('body');
+ $parser->parse($responseBody);
+ $responseBody = $parser->dump();
+ }
+ /*
+ * Valid XHTML DOCTYPE declaration (with DTD URI) is required
+ * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
+ */
+ $declarations = '';
+ $declarations .= '';
+
+ return "$declarations$responseBody";
+ }
+
+ /**
+  * Stash a matched fragment and return the placeholder that stands in for it.
+  *
+  * The full match ($matches[0]) is appended to $this->backup and
+  * $this->backup_count is advanced by one on every call.
+  *
+  * NOTE(review): the placeholder literal below looks truncated (markup seems
+  * to have been stripped in front of "backup_count}"). As written it is a
+  * constant string and does not embed the counter - confirm against upstream.
+  *
+  * @param array $matches match array; only index 0 is read (presumably supplied
+  *                       by a preg_* callback - the call site is not visible here)
+  * @return string placeholder text to substitute for the match
+  */
+ private function backup($matches)
+ {
+     $placeholder = "backup_count}\" />";
+
+     $this->backup[] = $matches[0];
+     ++$this->backup_count;
+
+     return $placeholder;
+ }
+
+
+ /**
+  * Merge configuration parameters into this adapter's configuration.
+  *
+  * Option names are lower-cased before they are stored. Validation is applied
+  * to the lower-cased name as well, so a "Tidy"/"TIDY" entry is subject to the
+  * same "output-xhtml" restriction as a plain "tidy" entry. $this->config is
+  * only modified once the whole array has passed validation.
+  *
+  * @param array $config option name => value pairs
+  * @return Diggin_Scraper_Adapter_Htmlscraping
+  * @throws Diggin_Scraper_Adapter_Exception if $config is not an array, or if a
+  *         tidy "output-xhtml" option is given with any value other than true
+  */
+ public function setConfig($config = array())
+ {
+     if (!is_array($config)) {
+         require_once 'Diggin/Scraper/Adapter/Exception.php';
+         throw new Diggin_Scraper_Adapter_Exception('Expected array parameter, given ' . gettype($config));
+     }
+
+     // Normalise and validate first; nothing is stored if any entry is rejected.
+     $normalized = array();
+     foreach ($config as $key => $value) {
+         $key = strtolower($key);
+
+         // Check the lower-cased key: that is the key the value ends up under.
+         if ($key === 'tidy' && isset($value['output-xhtml']) && $value['output-xhtml'] !== true) {
+             require_once 'Diggin/Scraper/Adapter/Exception.php';
+             throw new Diggin_Scraper_Adapter_Exception('tidy-config "output-xhtml" not as true - not allowed');
+         }
+
+         $normalized[$key] = $value;
+     }
+
+     foreach ($normalized as $key => $value) {
+         $this->config[$key] = $value;
+     }
+
+     return $this;
+ }
+
+ /**
+  * Set the charset front object that getCharsetFront() returns.
+  *
+  * @param Diggin_Http_Response_Charset_Front_EncodeInterface $charseFront
+  * @return void
+  */
+ public function setCharsetFront(Diggin_Http_Response_Charset_Front_EncodeInterface $charseFront)
+ {
+     // The parameter name (sic) is kept as declared. The previous body read the
+     // undefined variable $charsetFront, so the property was always set to null.
+     $this->_charsetFront = $charseFront;
+ }
+
+ /**
+  * Return the charset front object, creating the default one on first use.
+  *
+  * When $this->_charsetFront is empty, a Diggin_Http_Response_Charset_Front_UrlRegex
+  * is loaded, instantiated and stored in it before being returned.
+  *
+  * @return mixed the stored charset front object
+  */
+ public function getCharsetFront()
+ {
+     // Already configured: hand back the stored instance untouched.
+     if ($this->_charsetFront) {
+         return $this->_charsetFront;
+     }
+
+     require_once 'Diggin/Http/Response/Charset/Front/UrlRegex.php';
+     $this->_charsetFront = new Diggin_Http_Response_Charset_Front_UrlRegex;
+
+     return $this->_charsetFront;
+ }
+
+}