Skip to content

Commit

Permalink
Allow maxPagesToCrawl to be optional and infinite by setting 0 which …
Browse files Browse the repository at this point in the history
…will display the infinity symbol. Default is 50
  • Loading branch information
cpdata committed Dec 4, 2023
1 parent 14eb9fa commit e700f6e
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 3 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ type Config = {
match: string;
/** Selector to grab the inner text from */
selector: string;
/** Don't crawl more than this many pages */

/** Optional - Don't crawl more than this many pages (0 = crawl all; default = 50) */
maxPagesToCrawl: number;
/** File name for the finished data */
outputFileName: string;
Expand Down
3 changes: 2 additions & 1 deletion src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ export const configSchema = z.object({
*/
selector: z.string().optional(),
/**
* **Optional:**
* Don't crawl more than this many pages
* @default 50
*/
maxPagesToCrawl: z.number().int().positive(),
maxPagesToCrawl: z.number().int().nonnegative().or(z.undefined()).optional(),
/**
* File name for the finished data
* @default "output.json"
Expand Down
10 changes: 9 additions & 1 deletion src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,12 @@ export async function crawl(config: Config) {
const crawler = new PlaywrightCrawler({
// Use the requestHandler to process each of the crawled pages.
async requestHandler({ request, page, enqueueLinks, log, pushData }) {
// Warn if unlimited crawling is enabled (maxPagesToCrawl set to 0)
if (config.maxPagesToCrawl == 0) {
log.warningOnce(`maxPagesToCrawl is set to 0, which means crawling will continue until it cannot find any more links defined by match: ${config.match}`);
config.maxPagesToCrawl = undefined;
}

if (config.cookie) {
// Set the cookie for the specific URL
const cookie = {
Expand All @@ -66,9 +72,11 @@ export async function crawl(config: Config) {
}

const title = await page.title();
// Display pageCounter/maxPagesToCrawl, or pageCounter/∞ when maxPagesToCrawl is unset (0 was given)
const maxPagesToCrawlDisplay = config.maxPagesToCrawl == undefined ? "∞" : config.maxPagesToCrawl;
pageCounter++;
log.info(
`Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
`Crawling: Page ${pageCounter} / ${maxPagesToCrawlDisplay} - URL: ${request.loadedUrl}...`
);

// Use custom handling for XPath selector
Expand Down

0 comments on commit e700f6e

Please sign in to comment.