Skip to content

Commit

Permalink
Catch unhandled Browsershot exceptions in crawlFailed (#469)
Browse files Browse the repository at this point in the history
* Add error handling around Browsershot call and send exception to crawlFailed

* Add tests

* Remove erroneous `only`

* Respect request delay

* Revert "Respect request delay"

This reverts commit ef3115f.

* Respect request delay

---------

Co-authored-by: David Racovan <david@rarebirdinc.com>
  • Loading branch information
superpenguin612 and David Racovan authored Jul 31, 2024
1 parent f68bdb0 commit 099ea77
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 1 deletion.
16 changes: 15 additions & 1 deletion src/Handlers/CrawlRequestFulfilled.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
use Spatie\Crawler\CrawlUrl;
use Spatie\Crawler\ResponseWithCachedBody;
use Spatie\Crawler\UrlParsers\UrlParser;
use Symfony\Component\Process\Exception\ProcessFailedException;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Exception\RequestException;

class CrawlRequestFulfilled
{
Expand All @@ -39,7 +42,18 @@ public function __invoke(ResponseInterface $response, $index)
$crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

if ($this->crawler->mayExecuteJavaScript()) {
$body = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);
try {
$body = $this->getBodyAfterExecutingJavaScript($crawlUrl->url);
} catch (ProcessFailedException $exception) {
$request = new Request("GET", $crawlUrl->url);
$exception = new RequestException($exception->getMessage(), $request);
$crawlUrl = $this->crawler->getCrawlQueue()->getUrlById($index);

$this->crawler->getCrawlObservers()->crawlFailed($crawlUrl, $exception);

usleep($this->crawler->getDelayBetweenRequests());
return;
}

$response = $response->withBody(Utils::streamFor($body));
}
Expand Down
19 changes: 19 additions & 0 deletions tests/CrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
use Spatie\Crawler\Test\TestClasses\CrawlLogger;
use Spatie\Crawler\Test\TestClasses\Log;
use stdClass;
use Symfony\Component\Process\Exception\ProcessFailedException;

beforeEach(function () {
skipIfTestServerIsNotRunning();
Expand Down Expand Up @@ -117,6 +118,24 @@
expect(javascriptInjectedUrls())->each->notToBeCrawled();
});

// Regression test: a Browsershot failure while rendering JavaScript must not
// escape startCrawling() as a ProcessFailedException.
it('fails gracefully when browsershot fails', function () {
expect(function () {
// NOTE(review): /simulate-activity presumably never reaches network idle,
// which is what is expected to make Browsershot fail here — confirm against
// the test server.
$browsershot = (new Browsershot)->waitUntilNetworkIdle();

// Connect, total and read timeouts of the HTTP client are all set to 60.
Crawler::create([
RequestOptions::CONNECT_TIMEOUT => 60,
RequestOptions::TIMEOUT => 60,
RequestOptions::READ_TIMEOUT => 60,
])
->setBrowsershot($browsershot)
->executeJavaScript()
->setCrawlObserver(new CrawlLogger())
->startCrawling('http://localhost:8080/simulate-activity');
})->not->toThrow(ProcessFailedException::class);

// Custom expectation; presumably inspects what CrawlLogger recorded for this
// URL — verify against the test helpers.
expect(['url' => 'http://localhost:8080/simulate-activity'])->toBeCrawledOnce();
});

it('uses a crawl profile to determine what should be crawled', function () {
$crawlProfile = new class() extends CrawlProfile
{
Expand Down
32 changes: 32 additions & 0 deletions tests/server/server.js
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,38 @@ app.get('/sitemap2.xml', function (req, res) {
res.end(sitemap2);
});

// Test fixture route: sends the status line, headers and a first chunk of the
// body, then leaves the response open.
app.get('/never-complete', function (request, response) {
    const headers = { 'Content-Type': 'text/plain' };

    // A timeout of 0 disables the automatic socket timeout.
    request.socket.setTimeout(0);

    response.writeHead(200, headers);
    response.write('Starting but never completing...\n');

    // response.end() is deliberately never called and no further data is sent,
    // so the response stays hanging.
});

// Test fixture route: serves an HTML page whose inline script requests
// /never-complete once immediately and then again every 1000 ms.
app.get('/simulate-activity', (req, res) => {
res.send(`
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Simulated Network Activity</title>
</head>
<body>
<h1>This page simulates a never-ending network request</h1>
<script>
function keepBusy() {
fetch('/never-complete')
}
keepBusy();
setInterval(keepBusy, 1000);
</script>
</body>
</html>
`);
});


let server = app.listen(8080, function () {
const host = 'localhost';
Expand Down

0 comments on commit 099ea77

Please sign in to comment.