diff --git a/README.md b/README.md
index 4e5c9be..4398429 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
English | 简体中文
-XCrawl is a Nodejs multifunctional crawler library. Provide configuration to batch fetch HTML, JSON, images, etc.
+XCrawl is a multifunctional crawler library for Node.js. Crawl HTML, JSON, file resources, etc. with simple configuration.
## Install
@@ -47,7 +47,7 @@ class XCrawl {
constructor(baseConfig?: IXCrawlBaseConifg)
fetchData(config: IFetchDataConfig): Promise>
fetchFile(config: IFetchFileConfig): Promise>
- fetchHTML(url: string): Promise
+ fetchHTML(config: string | IFetchHTMLConfig): Promise<JSDOM>
}
```
@@ -130,7 +130,7 @@ fetchHTML is the method of the above English | 简体中文
-XCrawl 是 Nodejs 多功能爬虫库。提供配置即可批量抓取 HTML 、JSON、图片等等。
+XCrawl 是 Node.js 多功能爬虫库。只需简单的配置即可抓取 HTML、JSON、文件资源等等。
## 安装
@@ -294,7 +300,7 @@ class XCrawl {
constructor(baseConfig?: IXCrawlBaseConifg)
fetchData(config: IFetchDataConfig): Promise>
fetchFile(config: IFetchFileConfig): Promise>
- fetchHTML(url: string): Promise
+ fetchHTML(config: string | IFetchHTMLConfig): Promise<JSDOM>
}
```
@@ -377,7 +383,7 @@ fetchHTML 是上面 myXCra
- 类型
```ts
-function fetchHTML(url: string): Promise
+function fetchHTML(config: string | IFetchHTMLConfig): Promise<JSDOM>
```
- 示例
@@ -484,6 +490,12 @@ interface IFetchFileConfig extends IFetchBaseConifg {
}
```
+- IFetchHTMLConfig
+
+```ts
+interface IFetchHTMLConfig extends IRequestConfig {}
+```
+
## 更多
如有 **问题** 或 **需求** 请在 https://github.com/coder-hxl/x-crawl/issues 中提 **Issues** 。
diff --git a/package.json b/package.json
index 31c0416..8ceab25 100644
--- a/package.json
+++ b/package.json
@@ -1,9 +1,9 @@
{
"private": true,
"name": "x-crawl",
- "version": "0.1.0",
+ "version": "0.1.1",
"author": "CoderHxl",
- "description": "XCrawl is a Nodejs multifunctional crawler library.",
+ "description": "XCrawl is a multifunctional crawler library for Node.js. Crawl HTML, JSON, file resources, etc. with simple configuration.",
"license": "MIT",
"main": "src/index.ts",
"scripts": {
diff --git a/publish/README.md b/publish/README.md
index 4e5c9be..01ad89d 100644
--- a/publish/README.md
+++ b/publish/README.md
@@ -2,7 +2,7 @@
English | 简体中文
-XCrawl is a Nodejs multifunctional crawler library. Provide configuration to batch fetch HTML, JSON, images, etc.
+XCrawl is a multifunctional crawler library for Node.js. Crawl HTML, JSON, file resources, etc. with simple configuration.
## Install
@@ -249,7 +249,7 @@ If you have any **questions** or **needs** , please submit **Issues in** https:/
English | 简体中文
-XCrawl 是 Nodejs 多功能爬虫库。提供配置即可批量抓取 HTML 、JSON、图片等等。
+XCrawl 是 Node.js 多功能爬虫库。只需简单的配置即可抓取 HTML、JSON、文件资源等等。
## 安装
diff --git a/publish/package.json b/publish/package.json
index bf0136e..c891fb2 100644
--- a/publish/package.json
+++ b/publish/package.json
@@ -1,8 +1,8 @@
{
"name": "x-crawl",
- "version": "0.1.0",
+ "version": "0.1.1",
"author": "CoderHxl",
- "description": "XCrawl is a Nodejs multifunctional crawler library.",
+ "description": "XCrawl is a multifunctional crawler library for Node.js. Crawl HTML, JSON, file resources, etc. with simple configuration.",
"license": "MIT",
"keywords": [
"nodejs",
diff --git a/src/index.ts b/src/index.ts
index 9acd8a1..a868420 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -3,12 +3,13 @@ import path from 'node:path'
import { JSDOM } from 'jsdom'
import { batchRequest, request } from './request'
-import { isArray, isUndefined } from './utils'
+import { isArray, isString, isUndefined } from './utils'
import {
IXCrawlBaseConifg,
IFetchDataConfig,
IFetchFileConfig,
+ IFetchHTMLConfig,
IFetchBaseConifg,
IFileInfo,
IFetchCommon,
@@ -145,9 +146,13 @@ export default class XCrawl {
})
}
- async fetchHTML(url: string): Promise {
+ async fetchHTML(config: string | IFetchHTMLConfig): Promise {
+ const rawRequestConifg: IFetchHTMLConfig = isString(config)
+ ? { url: config }
+ : config
+
const { requestConifg } = mergeConfig(this.baseConfig, {
- requestConifg: { url }
+ requestConifg: rawRequestConifg
})
const requestResItem = await request(requestConifg)
diff --git a/src/types.ts b/src/types.ts
index 41573e7..55fb249 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -74,6 +74,8 @@ export interface IFetchFileConfig extends IFetchBaseConifg {
}
}
+export interface IFetchHTMLConfig extends IRequestConfig {}
+
export interface IFileInfo {
fileName: string
mimeType: string
diff --git a/test/start/index.js b/test/start/index.js
index dbd8882..a1c8eb1 100644
--- a/test/start/index.js
+++ b/test/start/index.js
@@ -1 +1 @@
-"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),r=require("https"),s=require("node:url");function i(e){return new Promise((t=>setTimeout(t,e)))}function a(e,t=0){let o=Math.floor(Math.random()*e);return o{const r=c(e.data);e.data=r?e.data:JSON.stringify(e.data);const s=d(e),i=n.request(s,(e=>{const{statusCode:o,headers:n}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:o,headers:n,data:e})}))}));i.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),i.on("error",(e=>{o(e)})),"POST"!==s.method||r||i.write(e.data),i.end()}))}async function h(e,t,o){const n=e.length;let r=0;const s=!c(t),u="number"==typeof t;console.log(`Begin execution, total: ${n} `);for(const c of e){r++;let e="success",l=null,d={};try{d=await f(c)}catch(t){l=t,e=`error: ${t.message}`}if(o(l,{id:r,...d}),s&&r!==n){const o=u?t:a(t.max,t.min);console.log(`The ${r} request is ${e}, sleep for ${o}ms`),await i(o)}else console.log(`The ${r} request is ${e}`),console.log("All requests completed!")}}function m(e,t){const{baseUrl:o,timeout:n,intervalTime:r}=e,{requestConifg:s,intervalTime:i}=t,a=u(s)?s:[s];for(const e of a){const{url:t,timeout:r}=e;c(o)||(e.url=o+t),c(r)&&!c(n)&&(e.timeout=n)}return c(i)&&!c(r)&&(t.intervalTime=r),t}const g=new class{baseConfig;constructor(e={}){this.baseConfig=e}async fetchData(e){const{requestConifg:t,intervalTime:o}=m(this.baseConfig,e),n=u(t)?t:[t],r=[];return await h(n,o,((e,t)=>{if(e)return;const o=t.headers["content-type"]??"",n=t.data,s=o.includes("text")?n.toString():JSON.parse(n.toString());r.push({...t,data:s})})),r}fetchFile(o){return new Promise((n=>{const{requestConifg:r,intervalTime:s,fileConfig:i}=m(this.baseConfig,o),a=u(r)?r:[r],c=a.length;let l=0;const d=[];h(a,s,(function(o,r){if(o)return void(r.id===c&&n(d));const{id:s,statusCode:a,headers:u,data:f}=r,h=u["content-type"]??"",m=h.split("/").pop(),g=(new 
Date).getTime().toString(),p=e.resolve(i.storeDir,`${g}.${m}`);t.createWriteStream(p,"binary").write(f,(e=>{if(e)return console.log(`File save error at id ${s}: ${e.message}`);const t={fileName:g,mimeType:h,size:f.length,filePath:p};d.push({id:s,statusCode:a,headers:u,data:t}),++l!==c&&s!==c||n(d)}))}))}))}async fetchHTML(e){const{requestConifg:t}=m(this.baseConfig,{requestConifg:{url:e}}),n=await f(t);return new o.JSDOM(n.data)}}({timeout:1e4,intervalTime:{max:3e3,min:2e3}});g.fetchHTML("https://www.bilibili.com/").then((t=>{const o=t.window.document.querySelectorAll(".bili-video-card__cover"),n=[];o.forEach(((e,t)=>{const o=e.lastChild;t%2?n.push("https:"+o.src):n.push(o.src)})),console.log(n);const r=n.map((e=>({url:e})));g.fetchFile({requestConifg:r,fileConfig:{storeDir:e.resolve(__dirname,"./upload")}}).then((e=>{console.log(e)}))}));
+"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),r=require("https"),s=require("node:url");function i(e){return new Promise((t=>setTimeout(t,e)))}function a(e,t=0){let o=Math.floor(Math.random()*e);return o{const r=c(e.data);e.data=r?e.data:JSON.stringify(e.data);const s=d(e),i=n.request(s,(e=>{const{statusCode:o,headers:n}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:o,headers:n,data:e})}))}));i.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),i.on("error",(e=>{o(e)})),"POST"!==s.method||r||i.write(e.data),i.end()}))}async function h(e,t,o){const n=e.length;let r=0;const s=!c(t),u="number"==typeof t;console.log(`Begin execution, total: ${n} `);for(const c of e){r++;let e="success",l=null,d={};try{d=await f(c)}catch(t){l=t,e=`error: ${t.message}`}if(o(l,{id:r,...d}),s&&r!==n){const o=u?t:a(t.max,t.min);console.log(`The ${r} request is ${e}, sleep for ${o}ms`),await i(o)}else console.log(`The ${r} request is ${e}`),console.log("All requests completed!")}}function m(e,t){const{baseUrl:o,timeout:n,intervalTime:r}=e,{requestConifg:s,intervalTime:i}=t,a=u(s)?s:[s];for(const e of a){const{url:t,timeout:r}=e;c(o)||(e.url=o+t),c(r)&&!c(n)&&(e.timeout=n)}return c(i)&&!c(r)&&(t.intervalTime=r),t}const g=new class{baseConfig;constructor(e={}){this.baseConfig=e}async fetchData(e){const{requestConifg:t,intervalTime:o}=m(this.baseConfig,e),n=u(t)?t:[t],r=[];return await h(n,o,((e,t)=>{if(e)return;const o=t.headers["content-type"]??"",n=t.data,s=o.includes("text")?n.toString():JSON.parse(n.toString());r.push({...t,data:s})})),r}fetchFile(o){return new Promise((n=>{const{requestConifg:r,intervalTime:s,fileConfig:i}=m(this.baseConfig,o),a=u(r)?r:[r],c=a.length,l=[];h(a,s,(function(o,r){if(o)return void(r.id===c&&n(l));const{id:s,statusCode:a,headers:u,data:d}=r,f=u["content-type"]??"",h=f.split("/").pop(),m=(new 
Date).getTime().toString(),g=e.resolve(i.storeDir,`${m}.${h}`);t.createWriteStream(g,"binary").write(d,(e=>{if(e)return console.log(`File save error at id ${s}: ${e.message}`);const t={fileName:m,mimeType:f,size:d.length,filePath:g};l.push({id:s,statusCode:a,headers:u,data:t}),s===c&&n(l)}))}))}))}async fetchHTML(e){const t="string"==typeof e?{url:e}:e;const{requestConifg:n}=m(this.baseConfig,{requestConifg:t}),r=await f(n);return new o.JSDOM(r.data)}}({timeout:1e4,intervalTime:{max:3e3,min:2e3}});g.fetchHTML({url:"https://www.bilibili.com/"}).then((t=>{const o=t.window.document.querySelectorAll(".bili-video-card__cover"),n=[];o.forEach(((e,t)=>{const o=e.lastChild;t%2?n.push("https:"+o.src):n.push(o.src)})),console.log(n);const r=n.map((e=>({url:e})));g.fetchFile({requestConifg:r,fileConfig:{storeDir:e.resolve(__dirname,"./upload")}}).then((e=>{console.log(e)}))}));
diff --git a/test/start/index.ts b/test/start/index.ts
index 6c5fc94..2ae5b14 100644
--- a/test/start/index.ts
+++ b/test/start/index.ts
@@ -22,7 +22,7 @@ const testXCrawl = new XCrawl({
// console.log(res)
// })
-testXCrawl.fetchHTML('https://www.bilibili.com/').then((jsdom) => {
+testXCrawl.fetchHTML({ url: 'https://www.bilibili.com/' }).then((jsdom) => {
const document = jsdom.window.document
const imgBoxEl = document.querySelectorAll('.bili-video-card__cover')