Now archived due to fundamental issues. Replaced by SuperSimpleCrawler.
composer require brittainmedia/phpcrawl
use PHPCrawl\Enums\PHPCrawlerAbortReasons;
use PHPCrawl\Enums\PHPCrawlerMultiProcessModes;
use PHPCrawl\Enums\PHPCrawlerUrlCacheTypes;
use PHPCrawl\PHPCrawler;
use PHPCrawl\PHPCrawlerDocumentInfo;
// Custom crawler: an anonymous PHPCrawler subclass that reports on every document it receives
$crawler = new class() extends PHPCrawler {
    /**
     * Prints a short summary (URL, HTTP status code, number of links found) for one document.
     *
     * @param mixed $PageInfo Information about the received document; this method reads its
     *                        url, http_status_code and links_found_url_descriptors properties.
     *                        NOTE(review): presumably a PHPCrawlerDocumentInfo - confirm against the parent signature.
     * @return int Always 1, so crawling continues (a negative value is reported as a user abort).
     */
    public function handleDocumentInfo($PageInfo): int
    {
        // Address of the document that was just requested
        echo "URL: {$PageInfo->url}" . PHP_EOL;

        // Status code the server answered with
        echo "HTTP-statuscode: {$PageInfo->http_status_code}" . PHP_EOL;

        // How many links were discovered inside this document
        $linkCount = count($PageInfo->links_found_url_descriptors);
        echo "Links found: {$linkCount}" . PHP_EOL;

        // Keep the crawl going
        return 1;
    }
};
// Root URL of the crawl; kept in $url so the final report can print it as well
$url = 'https://bbc.co.uk/news';
$crawler->setURL($url);

// Optional: route requests through a proxy
//$crawler->setProxy($proxy_host, $proxy_port, $proxy_username, $proxy_password);

// Only receive the content of documents whose content-type is "text/html"
$crawler->addContentTypeReceiveRule('#text/html#');

// Skip links that point to adverts and tracking services
$advertFilterRule = "/\bads\b|2o7|a1\.yimg|ad(brite|click|farm|revolver|server|tech|vert)|at(dmt|wola)|banner|bizrate|blogads|bluestreak|burstnet|casalemedia|coremetrics|(double|fast)click|falkag|(feedster|right)media|googlesyndication|hitbox|httpads|imiclk|intellitxt|js\.overture|kanoodle|kontera|mediaplex|nextag|pointroll|qksrv|speedera|statcounter|tribalfusion|webtrends/";
$crawler->addURLFilterRule($advertFilterRule);

// Store and send cookie data the way a browser does
$crawler->enableCookieHandling(true);

// Request limit for this run
$crawler->setRequestLimit(1);

/**
 * Follow mode 3: the crawler only follows links to pages or files located in or under
 * the same path as the root URL.
 * E.g. if the root URL is
 * "http://www.foo.com/bar/index.html",
 * the crawler will follow links to "http://www.foo.com/bar/page.html" and
 * "http://www.foo.com/bar/path/index.html",
 * but not links to "http://www.foo.com/page.html".
 */
$crawler->setFollowMode(3);

// Keep following redirects until actual content is reached
$crawler->setFollowRedirectsTillContent(true);

// Working directory: a "phpcrawl" folder inside the system temp directory
$workingDirectory = sys_get_temp_dir() . DIRECTORY_SEPARATOR . 'phpcrawl' . DIRECTORY_SEPARATOR;
$crawler->setWorkingDirectory($workingDirectory);

// Keep the URL cache in memory
$crawler->setUrlCacheType(PHPCrawlerUrlCacheTypes::URLCACHE_MEMORY);

// File crawling (optional): stream matching content types to file, or cap the size of large files
//$crawler->addStreamToFileContentType('##');
//$crawler->setContentSizeLimit(500000); // Google only crawls pages 500kb and below?

// "nofollow" tags are obeyed
$crawler->obeyNoFollowTags(true);

// robots.txt is deliberately NOT obeyed
$crawler->obeyRobotsTxt(false);

// Pause between requests to reduce the chance of being blocked
$crawler->setRequestDelay(0.5);

// Present a browser user agent (a robot user agent could be used instead)
$crawler->setUserAgentString('Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0');

// Multi-process crawling (optional) - forces PHPCrawlerUrlCacheTypes::URLCACHE_SQLITE and needs link priorities
$crawler->addLinkPriority("/news/", 10);
$crawler->addLinkPriority("/\.jpeg/", 5);
// NOTE(review): confirm the parameter order of goMultiProcessed(); upstream PHPCrawl documents it as
// (process count, multi-process mode), in which case the mode constant here is read as a process count.
// Also confirm whether go() below is meant as an alternative to this call rather than an extra run.
$crawler->goMultiProcessed(PHPCrawlerMultiProcessModes::MPMODE_CHILDS_EXECUTES_USERCODE);

// Configuration complete - start crawling
$crawler->go();
// Once the crawl has finished, print a short summary
// (see the getProcessReport() method for more information).
$report = $crawler->getProcessReport();
echo 'Finished crawling site: ' . $url . PHP_EOL;
echo 'Summary:' . PHP_EOL;
echo 'Links followed: ' . $report->links_followed . PHP_EOL;
echo 'Documents received: ' . $report->files_received . PHP_EOL;
echo 'Bytes received: ' . $report->bytes_received . ' bytes' . PHP_EOL;
echo 'Process runtime: ' . $report->process_runtime . ' sec' . PHP_EOL;
// Peak memory usage is a size, so it is labelled in bytes (it was previously labelled "sec")
echo 'Process memory: ' . $report->memory_peak_usage . ' bytes' . PHP_EOL;
echo 'Server connect time: ' . $report->avg_server_connect_time . ' sec' . PHP_EOL;
echo 'Server response time: ' . $report->avg_server_response_time . ' sec' . PHP_EOL;
// The transfer rate is a rate, so it is labelled in bytes per second (it was previously labelled "bytes")
echo 'Server transfer rate: ' . $report->avg_proc_data_transfer_rate . ' bytes/sec' . PHP_EOL;
// Translate the abort reason of the finished process into a readable message, then print it once
$abortReason = $report->abort_reason;
switch ($abortReason) {
    case PHPCrawlerAbortReasons::ABORTREASON_PASSEDTHROUGH:
        $abortMessage = 'Crawling-process aborted because everything is done/passed through.';
        break;
    case PHPCrawlerAbortReasons::ABORTREASON_TRAFFICLIMIT_REACHED:
        $abortMessage = 'Crawling-process aborted because the traffic limit set by user was reached.';
        break;
    case PHPCrawlerAbortReasons::ABORTREASON_FILELIMIT_REACHED:
        $abortMessage = 'Crawling-process aborted because the file limit set by user was reached.';
        break;
    case PHPCrawlerAbortReasons::ABORTREASON_USERABORT:
        $abortMessage = 'Crawling-process aborted because the handleDocumentInfo-method returned a negative value.';
        break;
    default:
        // Any value that is not one of the reasons handled above
        $abortMessage = 'Unknown abort reason.';
        break;
}
echo $abortMessage . PHP_EOL;
Initially just a copy of http://phpcrawl.cuab.de/, forked from mmerian for use with Composer.
Because the main project now appears to be abandoned (no updates for 4 years), I will make any changes and fixes in this repository.
- 0.9: compatible with PHP 7 only.
- 0.10: compatible with PHP 8. (Please submit issues.)
- Introduced namespaces
- Lots of bug fixes
- Refactored various class sections
Now archived...