Skip to content

Commit

Permalink
Added waitPerPageCrawlTimeoutRange for a random range in milliseconds…
Browse files Browse the repository at this point in the history
… between page requests to help with rate limiting
  • Loading branch information
cpdata committed Dec 4, 2023
1 parent c6b6303 commit b427b25
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 0 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,15 @@ type Config = {
maxTokens?: number;
/** Optional - Maximum number of concurrent (parallel) requests at a time */
maxConcurrency?: number;

/** Optional - waitPerPageCrawlTimeoutRange is an object containing a min and a max, each a number of milliseconds. A random wait between min and max is applied after each page crawl.
* Use waitPerPageCrawlTimeoutRange to handle rate limiting.
*/
waitPerPageCrawlTimeoutRange?: {
min: number,
max: number,
};

/** Optional - Boolean parameter to run Playwright with a displayed browser or headless (default: headless = true). */
headless?: boolean;
};
Expand Down
1 change: 1 addition & 0 deletions config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ export const defaultConfig: Config = {
url: "https://www.builder.io/c/docs/developers",
match: "https://www.builder.io/c/docs/**",
maxPagesToCrawl: 50,
waitPerPageCrawlTimeoutRange: {min:1000, max:1000},
headless: true,
maxConcurrency: 1,
};
9 changes: 9 additions & 0 deletions src/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,15 @@ export const configSchema = z.object({
* @example 5000
*/
maxTokens: z.number().int().positive().optional(),
/**
* **Optional:**
* Range for random number of milliseconds between **min** and **max** to wait after each page crawl
* @default {min:1000,max:1000}
* */
waitPerPageCrawlTimeoutRange: z.object({
min: z.number().int().nonnegative(),
max: z.number().int().nonnegative(),
}).optional(),
/**
* **Optional:**
* Headless mode
Expand Down
22 changes: 22 additions & 0 deletions src/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
}

export async function crawl(config: Config) {

// Pause helper: returns a promise that settles once `time` milliseconds
// have elapsed, so callers can `await` it between page crawls.
function delay(time: number) {
  return new Promise((done) => {
    setTimeout(done, time);
  });
}

configSchema.parse(config);

if (process.env.NO_CRAWL !== "true") {
Expand Down Expand Up @@ -109,6 +117,20 @@ export async function crawl(config: Config) {
globs:
typeof config.match === "string" ? [config.match] : config.match,
});
// Rate limiting: pause after handling this page before the next one is crawled.
// When waitPerPageCrawlTimeoutRange is configured the pause is randomized;
// otherwise a fixed 1000 ms pause is used.
if (config.waitPerPageCrawlTimeoutRange) {
// Pick a random integer number of milliseconds in [min, max], both ends
// inclusive (the "+ 1" makes max reachable after Math.floor).
// NOTE(review): this assumes min <= max; the visible part of the schema only
// checks that both are non-negative integers — confirm or add validation.
const randomTimeout = Math.floor(Math.random() * (config.waitPerPageCrawlTimeoutRange.max - config.waitPerPageCrawlTimeoutRange.min + 1) + config.waitPerPageCrawlTimeoutRange.min);
log.info(
`Waiting ${randomTimeout} milliseconds before next crawl to avoid rate limiting...`
);
// Wait for the random amount of time before crawling the next page
await delay(randomTimeout);
}else{
// No range configured: fall back to a fixed 1 second wait before the next page
await delay(1000);
}
},
maxConcurrency: config.maxConcurrency || 1 , // Set the max concurrency
maxRequestsPerCrawl: config.maxPagesToCrawl, // Set the max pages to crawl or set to 0 to scrape the full website.
headless: config.headless ?? true, // Set to false to see the browser in action
Expand Down

0 comments on commit b427b25

Please sign in to comment.