Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,19 @@ export interface BasicCrawlerOptions<
* the Proxy URLs provided and rotated according to the configuration.
*/
proxyConfiguration?: ProxyConfiguration;

/**
* A unique identifier for the crawler instance. This ID is used to isolate the state returned by
* {@apilink BasicCrawler.useState|`crawler.useState()`} from other crawler instances.
*
* When multiple crawler instances use `useState()` without an explicit `id`, they will share the same
* state object for backward compatibility. A warning will be logged in this case.
*
 * To ensure each crawler has its own isolated state that also persists across script restarts
 * (e.g., during Apify migrations), provide a stable, unique ID for each crawler instance.
 */
id?: string;
}

/**
Expand Down Expand Up @@ -476,6 +489,12 @@ export class BasicCrawler<
> {
protected static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';

/**
* Tracks crawler instances that accessed shared state without having an explicit id.
* Used to detect and warn about multiple crawlers sharing the same state.
*/
private static useStateCrawlerIds = new Set<string>();

/**
* A reference to the underlying {@apilink Statistics} class that collects and logs run statistics for requests.
*/
Expand Down Expand Up @@ -569,6 +588,8 @@ export class BasicCrawler<
private experiments: CrawlerExperiments;
private readonly robotsTxtFileCache: LruCache<RobotsTxtFile>;
private _experimentWarnings: Partial<Record<keyof CrawlerExperiments, boolean>> = {};
private readonly crawlerId: string;
private readonly hasExplicitId: boolean;

protected static optionsShape = {
contextPipelineBuilder: ow.optional.object,
Expand Down Expand Up @@ -612,6 +633,8 @@ export class BasicCrawler<
experiments: ow.optional.object,

statisticsOptions: ow.optional.object,

id: ow.optional.string,
};

/**
Expand Down Expand Up @@ -659,8 +682,15 @@ export class BasicCrawler<
// internal
log = defaultLog.child({ prefix: this.constructor.name }),
experiments = {},

id,
} = options;

// Store whether the user explicitly provided an ID
this.hasExplicitId = id !== undefined;
// Store the user-provided ID, or generate a unique one for tracking purposes (not for state key)
this.crawlerId = id ?? cryptoRandomObjectId();

// Store the builder so that it can be run when the contextPipeline is needed.
// Invoking it immediately would cause problems with parent constructor call order.
this.contextPipelineBuilder = () => {
Expand Down Expand Up @@ -754,6 +784,7 @@ export class BasicCrawler<
logMessage: `${log.getOptions().prefix} request statistics:`,
log,
config,
...(this.hasExplicitId ? { id: this.crawlerId } : {}),
...statisticsOptions,
});
this.sessionPoolOptions = {
Expand Down Expand Up @@ -1098,6 +1129,23 @@ export class BasicCrawler<

async useState<State extends Dictionary = Dictionary>(defaultValue = {} as State): Promise<State> {
const kvs = await KeyValueStore.open(null, { config: this.config });

if (this.hasExplicitId) {
const stateKey = `${BasicCrawler.CRAWLEE_STATE_KEY}_${this.crawlerId}`;
return kvs.getAutoSavedValue<State>(stateKey, defaultValue);
}

BasicCrawler.useStateCrawlerIds.add(this.crawlerId);

if (BasicCrawler.useStateCrawlerIds.size > 1) {
defaultLog.warningOnce(
'Multiple crawler instances are calling useState() without an explicit `id` option. \n' +
'This means they will share the same state object, which is likely unintended. \n' +
'To fix this, provide a unique `id` option to each crawler instance. \n' +
'Example: new BasicCrawler({ id: "my-crawler-1", ... })',
);
}

return kvs.getAutoSavedValue<State>(BasicCrawler.CRAWLEE_STATE_KEY, defaultValue);
}

Expand Down
21 changes: 18 additions & 3 deletions packages/core/src/crawlers/statistics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ export class Statistics {
/**
* Statistic instance id.
*/
readonly id = Statistics.id++; // assign an id while incrementing so it can be saved/restored from KV
readonly id: string;

/**
* Current statistic state used for doing calculations on {@apilink Statistics.calculate} calls
Expand All @@ -90,7 +90,7 @@ export class Statistics {
private readonly config: Configuration;

protected keyValueStore?: KeyValueStore = undefined;
protected persistStateKey = `SDK_CRAWLER_STATISTICS_${this.id}`;
protected persistStateKey: string;
private logIntervalMillis: number;
private logMessage: string;
private listener: () => Promise<void>;
Expand All @@ -115,6 +115,7 @@ export class Statistics {
config: ow.optional.object,
persistenceOptions: ow.optional.object,
saveErrorSnapshots: ow.optional.boolean,
id: ow.optional.any(ow.number, ow.string),
}),
);

Expand All @@ -127,8 +128,12 @@ export class Statistics {
enable: true,
},
saveErrorSnapshots = false,
id,
} = options;

this.id = id ?? String(Statistics.id++);
this.persistStateKey = `SDK_CRAWLER_STATISTICS_${this.id}`;

this.log = (options.log ?? defaultLog).child({ prefix: 'Statistics' });
this.errorTracker = new ErrorTracker({ ...errorTrackerConfig, saveErrorSnapshots });
this.errorTrackerRetry = new ErrorTracker({ ...errorTrackerConfig, saveErrorSnapshots });
Expand Down Expand Up @@ -474,14 +479,24 @@ export interface StatisticsOptions {
* @default false
*/
saveErrorSnapshots?: boolean;

/**
* A unique identifier for this statistics instance. This ID is used for persistence
* to the key value store, ensuring the same statistics can be loaded after script restarts.
*
* If not provided, an auto-incremented ID will be used for backward compatibility.
* This means statistics may not persist correctly across script restarts
* if crawler creation order changes.
*/
id?: string;
}

/**
* Format of the persisted stats
*/
export interface StatisticPersistedState extends Omit<StatisticState, 'statsPersistedAt'> {
requestRetryHistogram: number[];
statsId: number;
statsId: string;
requestAvgFailedDurationMillis: number;
requestAvgFinishedDurationMillis: number;
requestTotalDurationMillis: number;
Expand Down
57 changes: 56 additions & 1 deletion test/core/crawlers/basic_crawler.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import express from 'express';
import { MemoryStorageEmulator } from 'test/shared/MemoryStorageEmulator.js';
import type { SetRequired } from 'type-fest';
import type { Mock } from 'vitest';
import { afterAll, beforeAll, beforeEach, describe, expect, test } from 'vitest';
import { afterAll, beforeAll, beforeEach, describe, expect, test, vitest } from 'vitest';

import log from '@apify/log';

Expand Down Expand Up @@ -414,6 +414,61 @@ describe('BasicCrawler', () => {
expect(await requestList.isEmpty()).toBe(true);
});

test('print a warning on sharing state between two crawlers', async () => {
    // Crawlers built without an explicit `id` fall back to the shared state key.
    const buildAnonymousCrawler = () =>
        new BasicCrawler({
            requestHandler: async ({ request, useState }) => {
                const state = await useState<{ urls: string[] }>({ urls: [] });
                state.urls.push(request.url);
            },
        });

    const warningSpy = vitest.spyOn(log, 'warning');

    const first = buildAnonymousCrawler();
    const second = buildAnonymousCrawler();

    await first.run([`http://${HOSTNAME}:${port}/`]);
    await second.run([`http://${HOSTNAME}:${port}/?page=2`]);

    // Backward compatibility: without ids, both crawlers see the very same state object.
    const firstState = await first.useState<{ urls: string[] }>();
    const secondState = await second.useState<{ urls: string[] }>();

    expect(firstState).toBe(secondState);
    expect(firstState.urls).toHaveLength(2);
    expect(firstState.urls).toContain(`http://${HOSTNAME}:${port}/`);
    expect(firstState.urls).toContain(`http://${HOSTNAME}:${port}/?page=2`);
    // The shared-state warning must have been logged exactly for this situation.
    expect(warningSpy).toBeCalledWith(expect.stringContaining('Multiple crawler instances are calling useState()'));
});

test('crawlers with explicit id have isolated state', async () => {
    // Crawlers constructed with distinct `id`s get their own namespaced state keys.
    const buildCrawlerWithId = (id: string) =>
        new BasicCrawler({
            id,
            requestHandler: async ({ request, useState }) => {
                const state = await useState<{ urls: string[] }>({ urls: [] });
                state.urls.push(request.url);
            },
        });

    const first = buildCrawlerWithId('crawler-1');
    const second = buildCrawlerWithId('crawler-2');

    await first.run([`http://${HOSTNAME}:${port}/`]);
    await second.run([`http://${HOSTNAME}:${port}/?page=2`]);

    // Each crawler should have recorded only its own URL in its own state object.
    const firstState = await first.useState<{ urls: string[] }>();
    const secondState = await second.useState<{ urls: string[] }>();

    expect(firstState).not.toBe(secondState);
    expect(firstState.urls).toHaveLength(1);
    expect(firstState.urls).toContain(`http://${HOSTNAME}:${port}/`);
    expect(secondState.urls).toHaveLength(1);
    expect(secondState.urls).toContain(`http://${HOSTNAME}:${port}/?page=2`);
});

test.each([EventType.MIGRATING, EventType.ABORTING])(
'should pause on %s event and persist RequestList state',
async (event) => {
Expand Down
44 changes: 41 additions & 3 deletions test/core/crawlers/statistics.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ describe('Statistics', () => {
describe('persist state', () => {
// needs to go first for predictability
test('should increment id by each new consecutive instance', () => {
    // The suite-level instance created in beforeEach grabbed the first counter value.
    expect(stats.id).toEqual('0');
    // @ts-expect-error Accessing private prop
    expect(Statistics.id).toEqual(1);
    // @ts-expect-error Accessing private prop
    expect(stats.persistStateKey).toEqual('SDK_CRAWLER_STATISTICS_0');
    // Each subsequent instance without an explicit id takes the next counter value,
    // stringified for the new string-typed `id` field.
    const first = new Statistics();
    const second = new Statistics();
    expect(first.id).toEqual('1');
    expect(second.id).toEqual('2');
    // @ts-expect-error Accessing private prop
    expect(Statistics.id).toEqual(3);
});
Expand Down Expand Up @@ -338,4 +338,42 @@ describe('Statistics', () => {
expect(stats.state.requestsFinished).toEqual(0);
expect(stats.requestRetryHistogram).toEqual([]);
});

describe('explicit id option', () => {
    test('statistics with same explicit id should share persisted state', async () => {
        // Record one finished request and persist it under the shared id.
        const writer = new Statistics({ id: 'shared-stats' });
        writer.startJob(0);
        vitest.advanceTimersByTime(100);
        writer.finishJob(0, 0);

        await writer.startCapturing();
        await writer.persistState();
        await writer.stopCapturing();

        // A fresh instance with the same id should restore that persisted state.
        const reader = new Statistics({ id: 'shared-stats' });
        await reader.startCapturing();

        expect(reader.state.requestsFinished).toEqual(1);

        await reader.stopCapturing();
    });

    test('statistics with different explicit ids should have isolated state', async () => {
        // Persist one finished request under the first id.
        const statsA = new Statistics({ id: 'stats-a' });
        statsA.startJob(0);
        vitest.advanceTimersByTime(100);
        statsA.finishJob(0, 0);

        await statsA.startCapturing();
        await statsA.persistState();
        await statsA.stopCapturing();

        // A different id must not pick up the other instance's persisted state.
        const statsB = new Statistics({ id: 'stats-b' });
        await statsB.startCapturing();

        expect(statsB.state.requestsFinished).toEqual(0);

        await statsB.stopCapturing();
    });
});
});