import { CheerioCrawler } from '@crawlee/cheerio';
// Minimal reproduction: two independent CheerioCrawler instances are run one
// after the other against the same start URL. Neither crawler is given an
// explicit request queue.
const START_URL = 'https://crawlee.dev';

// First crawler: its handler prints a marker for every request it processes.
const crawlerA = new CheerioCrawler({
    requestHandler: () => {
        console.log('AAAAA');
    },
});
await crawlerA.run([START_URL]);

// Second crawler: same start URL, different marker, and `purgeRequestQueue`
// explicitly enabled for the run.
// NOTE(review): per the accompanying log output, this run finishes with
// 0 requests and 'BBBBB' is never printed — presumably because the URL is
// already marked as handled in the queue both crawlers share; confirm.
const crawlerB = new CheerioCrawler({
    requestHandler: () => {
        console.log('BBBBB');
    },
});
await crawlerB.run([START_URL], { purgeRequestQueue: true });
INFO CheerioCrawler: Starting the crawler.
AAAAA
INFO CheerioCrawler: All requests from the queue have been processed, the crawler will shut down.
INFO CheerioCrawler: Final request statistics: {"requestsFinished":1,"requestsFailed":0,"retryHistogram":[1],"requestAvgFailedDurationMillis":null,"requestAvgFinishedDurationMillis":728,"requestsFinishedPerMinute":71,"requestsFailedPerMinute":0,"requestTotalDurationMillis":728,"requestsTotal":1,"crawlerRuntimeMillis":844}
INFO CheerioCrawler: Finished! Total 1 requests: 1 succeeded, 0 failed. {"terminal":true}
INFO CheerioCrawler: Starting the crawler.
INFO CheerioCrawler: All requests from the queue have been processed, the crawler will shut down.
INFO CheerioCrawler: Final request statistics: {"requestsFinished":0,"requestsFailed":0,"retryHistogram":[],"requestAvgFailedDurationMillis":null,"requestAvgFinishedDurationMillis":null,"requestsFinishedPerMinute":0,"requestsFailedPerMinute":0,"requestTotalDurationMillis":0,"requestsTotal":0,"crawlerRuntimeMillis":69}
INFO CheerioCrawler: Finished! Total 0 requests: 0 succeeded, 0 failed. {"terminal":true}
The snippet above logs only `AAAAA`: the second crawler does not process any requests, because after the first run the default request queue (RQ) contains only the first request, which is already marked as handled.
This behaviour is not well documented and, IMO, is wrong given the parameter values (`purgeRequestQueue: true`).
Tested only locally (with `@crawlee/memory-storage`).