@@ -3,9 +3,9 @@ import { load } from 'cheerio';
33import { CheerioCrawlingContext , htmlToText , log , PlaywrightCrawlingContext , sleep , Request } from 'crawlee' ;
44
55import { ContentCrawlerStatus , ContentCrawlerTypes } from './const.js' ;
6- import { addResultToResponse , sendResponseIfFinished } from './responses.js' ;
6+ import { addResultToResponse , responseData , sendResponseIfFinished } from './responses.js' ;
77import { Output , ContentCrawlerUserData } from './types.js' ;
8- import { addTimeMeasureEvent , transformTimeMeasuresToRelative } from './utils.js' ;
8+ import { addTimeMeasureEvent , isActorStandby , transformTimeMeasuresToRelative } from './utils.js' ;
99import { processHtml } from './website-content-crawler/html-processing.js' ;
1010import { htmlToMarkdown } from './website-content-crawler/markdown.js' ;
1111
@@ -27,6 +27,22 @@ async function waitForPlaywright({ page }: PlaywrightCrawlingContext, time: numb
2727 return Promise . race ( [ page . waitForLoadState ( 'networkidle' , { timeout : 0 } ) , sleep ( time - hardDelay ) ] ) ;
2828}
2929
30+ /**
31+ * Cancels a request whose response is no longer tracked in `responseData`.
32+ * If `responseData` has no entry for `responseId`, the request's `noRetry` flag is set to true
33+ * (so it is not retried) and an error is thrown; otherwise the function returns without any effect.
34+ * NOTE(review): a missing entry presumably means the response already timed out - confirm in responses.js.
35+ * @param {Request} request - The request to check; its `noRetry` flag is mutated when the entry is missing.
36+ * @param {string} responseId - The key looked up in `responseData`.
37+ * @throws {Error} 'Timed out. Cancelling the request...' when `responseId` is not present in `responseData`.
38+ */
39+ function checkTimeoutAndCancelRequest ( request : Request , responseId : string ) {
40+ if ( ! responseData . has ( responseId ) ) {
41+ request . noRetry = true ;
42+ throw new Error ( 'Timed out. Cancelling the request...' ) ;
43+ }
44+ }
45+
3046/**
3147 * Decide whether to wait based on the remaining time left for the Actor to run.
3248 * Always waits if the Actor is in the STANDBY_MODE.
@@ -148,7 +164,9 @@ export async function requestHandlerPlaywright(
148164 context : PlaywrightCrawlingContext < ContentCrawlerUserData > ,
149165) {
150166 const { request, response, page, closeCookieModals } = context ;
151- const { contentScraperSettings : settings } = request . userData ;
167+ const { contentScraperSettings : settings , responseId } = request . userData ;
168+
169+ if ( isActorStandby ( ) ) checkTimeoutAndCancelRequest ( request , responseId ) ;
152170
153171 log . info ( `Processing URL: ${ request . url } ` ) ;
154172 addTimeMeasureEvent ( request . userData , 'playwright-request-start' ) ;
@@ -180,6 +198,9 @@ export async function requestHandlerCheerio(
180198 context : CheerioCrawlingContext < ContentCrawlerUserData > ,
181199) {
182200 const { $, request, response } = context ;
201+ const { responseId } = request . userData ;
202+
203+ if ( isActorStandby ( ) ) checkTimeoutAndCancelRequest ( request , responseId ) ;
183204
184205 log . info ( `Processing URL: ${ request . url } ` ) ;
185206 addTimeMeasureEvent ( request . userData , 'cheerio-request-start' ) ;
0 commit comments