feat: stopping the crawlers gracefully with BasicCrawler.stop() #2792

Merged (12 commits, Jan 20, 2025)
19 changes: 18 additions & 1 deletion packages/basic-crawler/src/internals/basic-crawler.ts
@@ -294,7 +294,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
    /**
     * Allows to keep the crawler alive even if the {@apilink RequestQueue} gets empty.
     * By default, the `crawler.run()` will resolve once the queue is empty. With `keepAlive: true` it will keep running,
-    * waiting for more requests to come. Use `crawler.teardown()` to exit the crawler.
+    * waiting for more requests to come. Use `crawler.stop()` to exit the crawler gracefully, or `crawler.teardown()` to stop it immediately.
     */
    keepAlive?: boolean;

@@ -977,6 +977,23 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
        return stats;
    }

    /**
     * Gracefully stops the current run of the crawler.
     *
     * All the tasks active at the time of calling this method will be allowed to finish.
     */
    stop(message = 'The crawler has been gracefully stopped.'): void {
        // Gracefully starve this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
        this.autoscaledPool
            ?.pause()
            // Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
            .then(async () => this.autoscaledPool?.abort())
Member

what's the motivation for not making the method async and ignoring the promise instead? feels handy to be able to await it (and users can always decide themselves to ignore the promise, unlike with the current solution)

also I don't think you need the async in here

Contributor Author

I planned it like this in the beginning, see 284274b

The reasons for moving to this were:
a. parity w/ the Python version
b. the snippet below:

async requestHandler({ crawler }) {
    if (iFoundWhatISearchedFor) {
        await crawler.stop();
    }
}

^ this looks like a completely harmless piece of code, but it will cause a deadlock: requestHandler is waiting for autoscaledPool.pause(), which is waiting for all tasks to finish (including the aforementioned requestHandler).

> also I don't think you need the async in here

You and me both, but @typescript-eslint/promise-function-async thinks otherwise.
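
For reference, that rule requires any function returning a Promise to be marked async, which is why the seemingly redundant keyword ends up in the `.then()` callback above. A minimal standalone illustration (hypothetical names, default rule options assumed):

// Flagged by @typescript-eslint/promise-function-async: returns a Promise
// without being marked async.
const stopPool = () => Promise.resolve('paused');

// Accepted by the rule: the function is explicitly async.
const stopPoolAsync = async () => 'paused';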

Contributor Author (@barjin, Jan 9, 2025)

Anyway, I'm definitely for making this async, but we need to address these concerns somehow. Also, note that even with the implementation from 284274b, crawler.stop() resolves once autoscaledPool.abort() has resolved, but that (afaik) doesn't mean crawler.run() has resolved.

Contributor

I'm honestly not sure that there is a good reason for an async stop() instead of a non-async one. If you are expected to call it from a request handler, waiting for the crawler to stop there leads to a deadlock, and the reasons are quite intuitive.

Should someone need to wait for the crawler to finish outside of the request handler, they can always wait for the promise from crawler.run() to resolve.
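
A short sketch of that pattern (editor's illustration, not part of the diff; the stopping condition is a hypothetical placeholder):

import { CheerioCrawler } from '@crawlee/cheerio';

const crawler = new CheerioCrawler({
    async requestHandler({ $, crawler }) {
        // Hypothetical stopping condition, standing in for "I found what I searched for".
        const foundIt = $('title').text().includes('Quick Start');
        // Fire-and-forget: awaiting stop() inside the handler would deadlock (see above).
        if (foundIt) crawler.stop();
    },
});

// Outside the handler, waiting for the crawler to finish is just awaiting run().
await crawler.run(['https://crawlee.dev/docs/quick-start']);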

Contributor Author

I'm always a bit wary of floating promises, but as long as you guys are fine with it (and there is the catch clause to stop it from killing the whole process), let's see how it goes.

I'll still add the promised tests from #2803, just to stay on the safe side.

            .then(() => this.log.info(message))
            .catch((err) => {
                this.log.error('An error occurred when stopping the crawler:', err);
            });
    }

    async getRequestQueue() {
        if (!this.requestQueue && this.requestList) {
            this.log.warningOnce(
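To illustrate how the new method fits together with `keepAlive` (editor's sketch, not part of the diff; the URL and timeout are placeholders):

import { BasicCrawler } from '@crawlee/basic';

const crawler = new BasicCrawler({
    keepAlive: true, // keep running even when the request queue drains
    async requestHandler({ request, log }) {
        log.info(`Processing ${request.url}`);
    },
});

// stop() is fire-and-forget: it pauses the autoscaled pool, lets in-flight
// tasks finish, and then resolves the crawler.run() promise below.
setTimeout(() => crawler.stop('Stopped after one minute.'), 60_000);

await crawler.run(['https://example.com']); // resolves once stop() takes effect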
7 changes: 7 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/.actor/actor.json
@@ -0,0 +1,7 @@
{
    "actorSpecification": 1,
    "name": "test-cheerio-stop-resume-ts",
    "version": "0.0",
    "buildTag": "latest",
    "env": null
}
8 changes: 8 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/.eslintrc.json
@@ -0,0 +1,8 @@
{
    "root": true,
    "extends": "../../.eslintrc.json",
    "parserOptions": {
        "project": "./test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json",
        "ecmaVersion": 2022
    }
}
11 changes: 11 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/.gitignore
@@ -0,0 +1,11 @@
.idea
.DS_Store
node_modules
package-lock.json
apify_storage
crawlee_storage
storage
main.d.ts
main.d.ts.map
main.js
main.js.map
28 changes: 28 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/Dockerfile
@@ -0,0 +1,28 @@
# using multistage build, as we need dev deps to build the TS source code
FROM apify/actor-node:20-beta AS builder

# copy all files, install all dependencies (including dev deps) and build the project
COPY . ./
RUN npm install --include=dev \
    && npm run build

# create final image
FROM apify/actor-node:20-beta
# copy only necessary files
COPY --from=builder /usr/src/app/packages ./packages
COPY --from=builder /usr/src/app/package.json ./
COPY --from=builder /usr/src/app/main.js ./

# install only prod deps
RUN npm --quiet set progress=false \
    && npm install --only=prod --no-optional --no-audit \
    && npm update --no-audit \
    && echo "Installed NPM packages:" \
    && (npm list --only=prod --no-optional --all || true) \
    && echo "Node.js version:" \
    && node --version \
    && echo "NPM version:" \
    && npm --version

# run compiled code
CMD npm run start:prod
31 changes: 31 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/main.ts
@@ -0,0 +1,31 @@
import { CheerioCrawler, Dataset } from '@crawlee/cheerio';
import { Actor } from 'apify';

if (process.env.STORAGE_IMPLEMENTATION === 'LOCAL') {
    // @ts-ignore
    await Actor.init({ storage: new (await import('@apify/storage-local')).ApifyStorageLocal() });
} else {
    await Actor.init();
}

let requestCount = 0;

const crawler = new CheerioCrawler();
crawler.router.addDefaultHandler(async ({ $, enqueueLinks, request, log }) => {
    const { url } = request;
    await enqueueLinks({
        globs: ['https://crawlee.dev/docs/**'],
    });

    const pageTitle = $('title').first().text();
    log.info(`URL: ${url} TITLE: ${pageTitle}`);
    await Dataset.pushData({ url, pageTitle });

    if (requestCount++ > 10) crawler.stop();
});

await crawler.run(['https://crawlee.dev/docs/quick-start']);

requestCount = 0;
await crawler.run(['https://crawlee.dev/docs/quick-start'], { purgeRequestQueue: false });
await Actor.exit({ exit: Actor.isAtHome() });
35 changes: 35 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/package.json
@@ -0,0 +1,35 @@
{
    "name": "test-cheerio-stop-resume-ts",
    "version": "0.0.1",
    "description": "Crawler Stop-Resume Test - TypeScript",
    "dependencies": {
        "apify": "next",
        "@apify/storage-local": "^2.1.3",
        "@crawlee/basic": "file:./packages/basic-crawler",
        "@crawlee/browser-pool": "file:./packages/browser-pool",
        "@crawlee/http": "file:./packages/http-crawler",
        "@crawlee/cheerio": "file:./packages/cheerio-crawler",
        "@crawlee/core": "file:./packages/core",
        "@crawlee/memory-storage": "file:./packages/memory-storage",
        "@crawlee/types": "file:./packages/types",
        "@crawlee/utils": "file:./packages/utils"
    },
    "overrides": {
        "apify": {
            "@crawlee/core": "file:./packages/core",
            "@crawlee/types": "file:./packages/types",
            "@crawlee/utils": "file:./packages/utils"
        }
    },
    "devDependencies": {
        "@apify/tsconfig": "^0.1.0",
        "typescript": "^5.0.0"
    },
    "scripts": {
        "start": "tsc && node main.js",
        "start:prod": "node main.js",
        "build": "tsc"
    },
    "type": "module",
    "license": "ISC"
}
9 changes: 9 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/actor/tsconfig.json
@@ -0,0 +1,9 @@
{
    "extends": "@apify/tsconfig",
    "compilerOptions": {
        "module": "ES2022",
        "target": "ES2022",
        "lib": ["DOM"]
    },
    "include": ["./**/*.ts"]
}
12 changes: 12 additions & 0 deletions test/e2e/cheerio-stop-resume-ts/test.mjs
@@ -0,0 +1,12 @@
import { initialize, getActorTestDir, runActor, expect } from '../tools.mjs';

const testActorDirname = getActorTestDir(import.meta.url);
await initialize(testActorDirname);

const { stats, datasetItems } = await runActor(testActorDirname);

// Some extra requests are expected (at most 10 extra for each run).
await expect(stats.requestsFinished < 40, 'crawler.stop() works');

const visitedUrls = new Set(datasetItems.map((x) => x.url));
await expect(visitedUrls.size === datasetItems.length, 'stateful crawler.run({ purgeRQ: false }) works');