diff --git a/Diggin_Scraper_Adapter_Htmlscraping/tags/release-0.4.8-20101223194005/Diggin/Scraper/Adapter/Htmlscraping.php b/Diggin_Scraper_Adapter_Htmlscraping/tags/release-0.4.8-20101223194005/Diggin/Scraper/Adapter/Htmlscraping.php new file mode 100644 index 00000000..a67965b8 --- /dev/null +++ b/Diggin_Scraper_Adapter_Htmlscraping/tags/release-0.4.8-20101223194005/Diggin/Scraper/Adapter/Htmlscraping.php @@ -0,0 +1,316 @@ + array('output-xhtml' => true, + 'wrap' => 0, + /*'wrap-script-literals' => true*/), + 'pre_ampersand_escape' => false, + 'url' => null + ); + + /** + * @var Diggin_Http_Response_Charset_Front_EncodeInterface + */ + private $_charsetFront; + + /** + * @var array + */ + private $backup = array(); + + /** + * @var integer + */ + private $backup_count = 0; + + /** + * Casts a SimpleXMLElement + * + * @param Zend_Http_Response $response + * @return SimpleXMLElement + */ + public function getSimplexml($response) + { + try { + $this->setConfig(array('pre_ampersand_escape' => true)); + $xhtml = $this->getXhtml($response); + } catch (Exception $e) { + require_once 'Diggin/Scraper/Adapter/Exception.php'; + throw new Diggin_Scraper_Adapter_Exception($e); + } + + /* + * Remove default namespace. + * This is because that SimpleXMLElement->registerXPathNamespace() may cause + * a problem under some circumstances (confirmed with PHP 5.1.6 so far). + * So you do not need to use SimpleXMLElement->registerXPathNamespace() + * when you use SimpleXMLElement->xpath(). 
+ */ + //origin is + //$responseBody = preg_replace('/\sxmlns="[^"]+"/', '', $xhtml); + + $responseBody = preg_replace(array('/\sxmlns:?[A-Za-z]*="[^"]+"/', "/\sxmlns:?[A-Za-z]*='[^']+'/"), '', $xhtml); + + try { + /** Diggin_Scraper_Adapter_Wrapper_SimpleXMLElement */ + require_once 'Diggin/Scraper/Adapter/Wrapper/SimpleXMLElement.php'; + //@see http://php.net/libxml.constants + if (isset($this->config['libxmloptions'])) { + $xml_object = @new Diggin_Scraper_Adapter_Wrapper_SimpleXMLElement($responseBody, $this->config['libxmloptions']); + } else { + $xml_object = @new Diggin_Scraper_Adapter_Wrapper_SimpleXMLElement($responseBody); + } + } catch (Exception $e) { + require_once 'Diggin/Scraper/Adapter/Exception.php'; + throw new Diggin_Scraper_Adapter_Exception($e); + } + + return $xml_object; + } + + /** + * Return array contains formated XHTML string + * created from the responded HTML of the given URL. + * array[code] => HTTP status code + * array[headers] => HTTP headers + * array[headers] => formated XHTML string made from the entity body + * Throw exception if error. + * + * @param string $url + * @param string $responseBody + * @return string + * @throws Diggin_Scraper_Adapter_Exception + */ + final public function getXhtml($response) + { + // convert to UTF-8 + $document = array('url' => $this->config['url'], + 'content' => array('body' => $response->getBody(), 'content-type' => $response->getHeader('content-type'))); + list($responseBody, $this->backup) = $this->getCharsetFront()->convert($document, $this->backup); + + /* + * Initialize the backups. + */ + $this->backup = array(); + $this->backup_count = 0; + /* + * Removing SCRIPT and STYLE is recommended. + * The following substitute code will capsulate the content of the tags in CDATA. + * If use it, be sure that some JavaScript method such as document.write + * is not compliant with XHTML/XML. 
+ */ + $tags = array('script', 'style'); + foreach ($tags as $tag) { + $responseBody = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '' , $responseBody); + } + /* + * Backup CDATA sections for later process. + */ + $responseBody = preg_replace_callback( + '//s', array($this, 'backup'), $responseBody + ); + /* + * Comment section must not contain two or more adjacent hyphens. + */ + $responseBody = preg_replace_callback( + '//si', + create_function('$matches', ' + return ""; + '), + $responseBody + ); + /* + * Backup comment sections for later process. + */ + $responseBody = preg_replace_callback( + '//s', array($this, 'backup'), $responseBody + ); + /* + * Process tags that is potentially dangerous for XML parsers. + */ + $responseBody = preg_replace_callback( + '/(]*?>)(.*?)(<\/textarea\b[^>]*?>)/si', + create_function('$matches', ' + return $matches[1].str_replace("<", "<", $matches[2]).$matches[3]; + '), + $responseBody + ); + $responseBody = preg_replace_callback( + '/]*?>(.*?)<\/xmp\b[^>]*?>/si', + create_function('$matches', ' + return "
".str_replace("<", "<", $matches[1])."
"; + '), + $responseBody + ); + $responseBody = preg_replace_callback( + '/]*?>(.*)$/si', + create_function('$matches', ' + return "
".str_replace("<", "<", $matches[1])."
"; + '), + $responseBody + ); + /* + * Remove DTD declarations, wrongly placed comments etc. + * This must be done before removing DOCTYPE. + */ + $responseBody = preg_replace('/]*?>/si', '', $responseBody); + /* + * XML and DOCTYPE declaration will be replaced. + */ + $responseBody = preg_replace('/]*?>/si', '', $responseBody); + $responseBody = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $responseBody); + if (preg_match('/^\s*$/s', $responseBody)) { + require_once 'Diggin/Scraper/Adapter/Exception.php'; + throw new Diggin_Scraper_Adapter_Exception('The entity body became empty after preprocessing.'); + } + + /* + * Restore CDATAs and comments. + */ + for ($i = 0; $i < $this->backup_count; $i++) { + $responseBody = str_replace("", $this->backup[$i], $responseBody); + } + + /* + * Use Tidy to format HTML if available. + * Otherwise, use HTMLParser class (is slower and consumes much memory). + */ + + /* + * Replace every '&' with '&' + * for XML parser not to break on non-predefined entities. + * So you may need to replace '&' with '&' + * to have the original HTML string from returned SimpleXML object. + * + * //@see + * And tidy, it will replace htmlspecialchars('>' '<') to ('<, '>'') + * if not as Html Tag for tidy. + * so, "str_replace('&')" before tidy. + */ + + if (extension_loaded('tidy')) { + if ($this->config['pre_ampersand_escape']) { + $responseBody = str_replace('&', '&', $responseBody); + } + $tidy = new tidy(); + $tidy->parseString($responseBody, $this->config['tidy'], 'UTF8'); + $tidy->cleanRepair(); + $responseBody = $tidy->html(); + } else { + if ($this->config['pre_ampersand_escape']) { + $responseBody = str_replace('&', '&', $responseBody); + } + //? 
+ $responseBody = str_replace('&', '&', $responseBody); + require_once 'HTMLParser.class.php'; + $parser = new HTMLParser; + $format_rule = require 'xhtml1-transitional_dtd.inc.php'; + $parser->setRule($format_rule); + $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml')); + $parser->setGenericParent('body'); + $parser->parse($responseBody); + $responseBody = $parser->dump(); + } + /* + * Valid XHTML DOCTYPE declaration (with DTD URI) is required + * for SimpleXMLElement->asXML() method to produce proper XHTML tags. + */ + $declarations = ''; + $declarations .= ''; + + return "$declarations$responseBody"; + } + + /** + * backup (Html and Xml comment) + * + * @param array $matches + * @return string + */ + private function backup($matches) + { + $this->backup[] = $matches[0]; + $replace = "backup_count}\" />"; + $this->backup_count++; + + return $replace; + } + + + /** + * Set configuration parameters for this + * + * @param array $config + * @return Diggin_Scraper_Adapter_Htmlscraping + * @throws Diggin_Scraper_Adapter_Exception + */ + public function setConfig($config = array()) + { + if (!is_array($config)) { + require_once 'Diggin/Scraper/Adapter/Exception.php'; + throw new Diggin_Scraper_Adapter_Exception('Expected array parameter, given ' . 
gettype($config)); + } + + if (isset($config['tidy']['output-xhtml']) && $config['tidy']['output-xhtml'] !== true) { + require_once 'Diggin/Scraper/Adapter/Exception.php'; + throw new Diggin_Scraper_Adapter_Exception('tidy-config "output-xhtml" not as true - not allowed'); + } + + foreach ($config as $k => $v) { + $this->config[strtolower($k)] = $v; + } + + return $this; + } + + /** + * Set the charset front that getXhtml() calls convert() on. + * + * The parameter was previously misspelled ($charseFront), so the assignment + * read an undefined variable and always stored null. + * + * @param Diggin_Http_Response_Charset_Front_EncodeInterface $charsetFront + * @return void + */ + public function setCharsetFront(Diggin_Http_Response_Charset_Front_EncodeInterface $charsetFront) + { + $this->_charsetFront = $charsetFront; + } + + /** + * Get the charset front, lazily creating a + * Diggin_Http_Response_Charset_Front_UrlRegex when none has been set. + * + * @return Diggin_Http_Response_Charset_Front_EncodeInterface + */ + public function getCharsetFront() + { + if (!$this->_charsetFront) { + require_once 'Diggin/Http/Response/Charset/Front/UrlRegex.php'; + $this->_charsetFront = new Diggin_Http_Response_Charset_Front_UrlRegex; + } + + return $this->_charsetFront; + } + +}