
Commit 4cf4ea6

feat: add crawling endpoint
1 parent 46756ac commit 4cf4ea6

17 files changed: +1309 −2 lines changed


scrapegraph-js/README.md

Lines changed: 99 additions & 0 deletions

@@ -151,6 +151,105 @@ const prompt = 'What is the latest version of Python and what are its main featu

### Crawl API

Start a crawl job to extract structured data from a website and its linked pages, using a custom schema.

```javascript
import { crawl, getCrawlRequest } from 'scrapegraph-js';
import 'dotenv/config';

const apiKey = process.env.SGAI_APIKEY;
const url = 'https://scrapegraphai.com/';
const prompt = 'What does the company do? and I need text content from their privacy and terms';

const schema = {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "ScrapeGraphAI Website Content",
  "type": "object",
  "properties": {
    "company": {
      "type": "object",
      "properties": {
        "name": { "type": "string" },
        "description": { "type": "string" },
        "features": { "type": "array", "items": { "type": "string" } },
        "contact_email": { "type": "string", "format": "email" },
        "social_links": {
          "type": "object",
          "properties": {
            "github": { "type": "string", "format": "uri" },
            "linkedin": { "type": "string", "format": "uri" },
            "twitter": { "type": "string", "format": "uri" }
          },
          "additionalProperties": false
        }
      },
      "required": ["name", "description"]
    },
    "services": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "service_name": { "type": "string" },
          "description": { "type": "string" },
          "features": { "type": "array", "items": { "type": "string" } }
        },
        "required": ["service_name", "description"]
      }
    },
    "legal": {
      "type": "object",
      "properties": {
        "privacy_policy": { "type": "string" },
        "terms_of_service": { "type": "string" }
      },
      "required": ["privacy_policy", "terms_of_service"]
    }
  },
  "required": ["company", "services", "legal"]
};

(async () => {
  try {
    // Start the crawl job
    const crawlResponse = await crawl(apiKey, url, prompt, schema, {
      cacheWebsite: true,
      depth: 2,
      maxPages: 2,
      sameDomainOnly: true,
      batchSize: 1,
    });
    console.log('Crawl job started. Response:', crawlResponse);

    // If the crawl is asynchronous and returns an ID, fetch the result
    const crawlId = crawlResponse.id || crawlResponse.task_id;
    if (crawlId) {
      for (let i = 0; i < 10; i++) {
        await new Promise((resolve) => setTimeout(resolve, 5000));
        const result = await getCrawlRequest(apiKey, crawlId);
        if (result.status === 'success' && result.result) {
          console.log('Crawl completed. Result:', result.result.llm_result);
          break;
        } else if (result.status === 'failed') {
          console.log('Crawl failed. Result:', result);
          break;
        } else {
          console.log(`Status: ${result.status}, waiting...`);
        }
      }
    } else {
      console.log('No crawl ID found in response. Synchronous result:', crawlResponse);
    }
  } catch (error) {
    console.error('Error occurred:', error);
  }
})();
```

You can pass either a plain JSON schema or a [Zod](https://www.npmjs.com/package/zod) schema as the `schema` parameter. The crawl API also supports options for crawl depth, maximum pages, same-domain restriction, and batch size, as shown in the sketch below.
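For the Zod case, a minimal sketch of an equivalent call might look like this (assuming the `zod` package is installed; the schema fields and option values here are illustrative, not required by the API):

```javascript
import { crawl } from 'scrapegraph-js';
import { z } from 'zod';
import 'dotenv/config';

// Illustrative Zod schema; crawl() converts it to JSON Schema before sending the request.
const schema = z.object({
  company: z.object({
    name: z.string(),
    description: z.string(),
  }),
  services: z.array(
    z.object({
      service_name: z.string(),
      description: z.string(),
    })
  ),
});

(async () => {
  try {
    const crawlResponse = await crawl(
      process.env.SGAI_APIKEY,
      'https://scrapegraphai.com/',
      'What does the company do?',
      schema,
      { depth: 2, maxPages: 2, sameDomainOnly: true, batchSize: 1 }
    );
    console.log('Crawl job started. Response:', crawlResponse);
  } catch (error) {
    console.error('Error occurred:', error);
  }
})();
```

Internally, `crawl` converts a Zod schema to JSON Schema via `zod-to-json-schema` before submitting the job.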
### Scraping local HTML

Extract structured data from local HTML content
Lines changed: 105 additions & 0 deletions

@@ -0,0 +1,105 @@

import { crawl, getCrawlRequest } from '../index.js';
import 'dotenv/config';

// Example .env file:
// SGAI_APIKEY=your_sgai_api_key

const apiKey = process.env.SGAI_APIKEY;

const schema = {
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "ScrapeGraphAI Website Content",
  "type": "object",
  "properties": {
    "company": {
      "type": "object",
      "properties": {
        "name": { "type": "string" },
        "description": { "type": "string" },
        "features": { "type": "array", "items": { "type": "string" } },
        "contact_email": { "type": "string", "format": "email" },
        "social_links": {
          "type": "object",
          "properties": {
            "github": { "type": "string", "format": "uri" },
            "linkedin": { "type": "string", "format": "uri" },
            "twitter": { "type": "string", "format": "uri" }
          },
          "additionalProperties": false
        }
      },
      "required": ["name", "description"]
    },
    "services": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "service_name": { "type": "string" },
          "description": { "type": "string" },
          "features": { "type": "array", "items": { "type": "string" } }
        },
        "required": ["service_name", "description"]
      }
    },
    "legal": {
      "type": "object",
      "properties": {
        "privacy_policy": { "type": "string" },
        "terms_of_service": { "type": "string" }
      },
      "required": ["privacy_policy", "terms_of_service"]
    }
  },
  "required": ["company", "services", "legal"]
};

const url = 'https://scrapegraphai.com/';
const prompt = 'What does the company do? and I need text content from their privacy and terms';

(async () => {
  if (!apiKey) {
    console.error('SGAI_APIKEY not found in environment. Please set it in your .env file.');
    process.exit(1);
  }

  try {
    // Start the crawl job
    console.log(`\nStarting crawl for: ${url}`);
    const crawlResponse = await crawl(apiKey, url, prompt, schema, {
      cacheWebsite: true,
      depth: 2,
      maxPages: 2,
      sameDomainOnly: true,
      batchSize: 1,
    });
    console.log('\nCrawl job started. Response:');
    console.log(JSON.stringify(crawlResponse, null, 2));

    // If the crawl is asynchronous and returns an ID, fetch the result
    const crawlId = crawlResponse.id || crawlResponse.task_id;
    if (crawlId) {
      console.log('\nPolling for crawl result...');
      for (let i = 0; i < 10; i++) {
        await new Promise((resolve) => setTimeout(resolve, 5000));
        const result = await getCrawlRequest(apiKey, crawlId);
        if (result.status === 'success' && result.result) {
          console.log('\nCrawl completed. Result:');
          console.log(JSON.stringify(result.result.llm_result, null, 2));
          break;
        } else if (result.status === 'failed') {
          console.log('\nCrawl failed. Result:');
          console.log(JSON.stringify(result, null, 2));
          break;
        } else {
          console.log(`Status: ${result.status}, waiting...`);
        }
      }
    } else {
      console.log('No crawl ID found in response. Synchronous result:');
      console.log(JSON.stringify(crawlResponse, null, 2));
    }
  } catch (error) {
    console.error('Error occurred:', error);
  }
})();

scrapegraph-js/index.js

Lines changed: 1 addition & 0 deletions

@@ -3,3 +3,4 @@ export { markdownify, getMarkdownifyRequest } from './src/markdownify.js';
export { searchScraper, getSearchScraperRequest } from './src/searchScraper.js';
export { getCredits } from './src/credits.js';
export { sendFeedback } from './src/feedback.js';
export { crawl, getCrawlRequest } from './src/crawl.js';

scrapegraph-js/src/crawl.js

Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@

import axios from 'axios';
import handleError from './utils/handleError.js';
import { ZodType } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';

/**
 * Start a crawl job using the ScrapeGraphAI API.
 *
 * @param {string} apiKey - Your ScrapeGraph AI API key
 * @param {string} url - The starting URL for the crawl
 * @param {string} prompt - The prompt to guide the crawl and extraction
 * @param {Object|ZodType} schema - JSON schema or Zod schema defining the structure of the extracted data
 * @param {Object} [options] - Optional crawl parameters
 * @param {boolean} [options.cacheWebsite=true] - Whether to cache the website content
 * @param {number} [options.depth=2] - Maximum depth of the crawl (1-10)
 * @param {number} [options.maxPages=2] - Maximum number of pages to crawl (1-100)
 * @param {boolean} [options.sameDomainOnly=true] - Whether to only crawl pages from the same domain
 * @param {number} [options.batchSize=1] - Batch size for processing pages (1-10)
 * @returns {Promise<Object>} The crawl job response
 * @throws {Error} Throws an error if the HTTP request fails
 */
export async function crawl(
  apiKey,
  url,
  prompt,
  schema,
  options = {}
) {
  const endpoint = 'https://api.scrapegraphai.com/v1/crawl';
  const headers = {
    'accept': 'application/json',
    'SGAI-APIKEY': apiKey,
    'Content-Type': 'application/json',
  };

  let schemaPayload;
  if (schema instanceof ZodType) {
    schemaPayload = zodToJsonSchema(schema);
  } else if (typeof schema === 'object' && schema !== null) {
    schemaPayload = schema;
  } else {
    throw new Error('The schema must be a Zod schema or a plain object');
  }

  const {
    cacheWebsite = true,
    depth = 2,
    maxPages = 2,
    sameDomainOnly = true,
    batchSize = 1,
  } = options;

  const payload = {
    url,
    prompt,
    schema: schemaPayload,
    cache_website: cacheWebsite,
    depth,
    max_pages: maxPages,
    same_domain_only: sameDomainOnly,
    batch_size: batchSize,
  };

  try {
    const response = await axios.post(endpoint, payload, { headers });
    return response.data;
  } catch (error) {
    handleError(error);
  }
}

/**
 * Get the result of a crawl job by ID.
 *
 * @param {string} apiKey - Your ScrapeGraph AI API key
 * @param {string} crawlId - The crawl job ID
 * @returns {Promise<Object>} The crawl result
 * @throws {Error} Throws an error if the HTTP request fails
 */
export async function getCrawlRequest(apiKey, crawlId) {
  const endpoint = `https://api.scrapegraphai.com/v1/crawl/${crawlId}`;
  const headers = {
    'accept': 'application/json',
    'SGAI-APIKEY': apiKey,
  };

  try {
    const response = await axios.get(endpoint, { headers });
    return response.data;
  } catch (error) {
    handleError(error);
  }
}
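
Taken together, these two functions define a start-then-poll workflow: `crawl` submits the job and `getCrawlRequest` fetches its status. As a rough sketch, a small wrapper along the lines below could encapsulate the polling loop (the `waitForCrawl` helper, its retry count, and its interval are illustrative, mirroring the README example; they are not part of the exported API):

```javascript
import { getCrawlRequest } from 'scrapegraph-js';

// Hypothetical helper: poll a crawl job until it reports 'success' or 'failed'.
// Status values and result shape follow the README example above.
async function waitForCrawl(apiKey, crawlId, { attempts = 10, intervalMs = 5000 } = {}) {
  for (let i = 0; i < attempts; i++) {
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
    const result = await getCrawlRequest(apiKey, crawlId);
    if (result.status === 'success' && result.result) return result.result;
    if (result.status === 'failed') {
      throw new Error(`Crawl ${crawlId} failed: ${JSON.stringify(result)}`);
    }
  }
  throw new Error(`Crawl ${crawlId} did not finish after ${attempts} polls`);
}

// Usage sketch:
//   const job = await crawl(apiKey, url, prompt, schema, { depth: 2, maxPages: 2 });
//   const { llm_result } = await waitForCrawl(apiKey, job.id || job.task_id);
```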
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
SGAI_APIKEY="your_sgai_api_key"
