diff --git a/README.md b/README.md index 4e5c9be..4398429 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ English | 简体中文 -XCrawl is a Nodejs multifunctional crawler library. Provide configuration to batch fetch HTML, JSON, images, etc. +XCrawl is a Nodejs multifunctional crawler library. Crawl HTML, JSON, file resources, etc. through simple configuration. ## Install @@ -47,7 +47,7 @@ class XCrawl { constructor(baseConfig?: IXCrawlBaseConifg) fetchData(config: IFetchDataConfig): Promise> fetchFile(config: IFetchFileConfig): Promise> - fetchHTML(url: string): Promise + fetchHTML(config: string | IFetchHTMLConfig): Promise } ``` @@ -130,7 +130,7 @@ fetchHTML is the method of the above English | 简体中文 -XCrawl 是 Nodejs 多功能爬虫库。提供配置即可批量抓取 HTML 、JSON、图片等等。 +XCrawl 是 Nodejs 多功能爬虫库。只需简单的配置即可抓取 HTML 、JSON、文件资源等等。 ## 安装 @@ -294,7 +300,7 @@ class XCrawl { constructor(baseConfig?: IXCrawlBaseConifg) fetchData(config: IFetchDataConfig): Promise> fetchFile(config: IFetchFileConfig): Promise> - fetchHTML(url: string): Promise + fetchHTML(config: string | IFetchHTMLConfig): Promise } ``` @@ -377,7 +383,7 @@ fetchHTML 是上面 myXCra - 类型 ```ts -function fetchHTML(url: string): Promise +function fetchHTML(config: string | IFetchHTMLConfig): Promise ``` - 示例 @@ -484,6 +490,12 @@ interface IFetchFileConfig extends IFetchBaseConifg { } ``` +- IFetchHTMLConfig + +```ts +interface IFetchHTMLConfig extends IRequestConfig {} +``` + ## 更多 如有 **问题** 或 **需求** 请在 https://github.com/coder-hxl/x-crawl/issues 中提 **Issues** 。 diff --git a/package.json b/package.json index 31c0416..8ceab25 100644 --- a/package.json +++ b/package.json @@ -1,9 +1,9 @@ { "private": true, "name": "x-crawl", - "version": "0.1.0", + "version": "0.1.1", "author": "CoderHxl", - "description": "XCrawl is a Nodejs multifunctional crawler library.", + "description": "XCrawl is a Nodejs multifunctional crawler library. Crawl HTML, JSON, file resources, etc. through simple configuration.", "license": "MIT", "main": "src/index.ts", "scripts": { diff --git a/publish/README.md b/publish/README.md index 4e5c9be..01ad89d 100644 --- a/publish/README.md +++ b/publish/README.md @@ -2,7 +2,7 @@ English | 简体中文 -XCrawl is a Nodejs multifunctional crawler library. Provide configuration to batch fetch HTML, JSON, images, etc. +XCrawl is a Nodejs multifunctional crawler library. Crawl HTML, JSON, file resources, etc. through simple configuration. ## Install @@ -249,7 +249,7 @@ If you have any **questions** or **needs** , please submit **Issues in** https:/ English | 简体中文 -XCrawl 是 Nodejs 多功能爬虫库。提供配置即可批量抓取 HTML 、JSON、图片等等。 +XCrawl 是 Nodejs 多功能爬虫库。只需简单的配置即可抓取 HTML 、JSON、文件资源等等。 ## 安装 diff --git a/publish/package.json b/publish/package.json index bf0136e..c891fb2 100644 --- a/publish/package.json +++ b/publish/package.json @@ -1,8 +1,8 @@ { "name": "x-crawl", - "version": "0.1.0", + "version": "0.1.1", "author": "CoderHxl", - "description": "XCrawl is a Nodejs multifunctional crawler library.", + "description": "XCrawl is a Nodejs multifunctional crawler library. Crawl HTML, JSON, file resources, etc. through simple configuration.", "license": "MIT", "keywords": [ "nodejs", diff --git a/src/index.ts b/src/index.ts index 9acd8a1..a868420 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,12 +3,13 @@ import path from 'node:path' import { JSDOM } from 'jsdom' import { batchRequest, request } from './request' -import { isArray, isUndefined } from './utils' +import { isArray, isString, isUndefined } from './utils' import { IXCrawlBaseConifg, IFetchDataConfig, IFetchFileConfig, + IFetchHTMLConfig, IFetchBaseConifg, IFileInfo, IFetchCommon, @@ -145,9 +146,13 @@ export default class XCrawl { }) } - async fetchHTML(url: string): Promise { + async fetchHTML(config: string | IFetchHTMLConfig): Promise { + const rawRequestConifg: IFetchHTMLConfig = isString(config) + ? { url: config } + : config + const { requestConifg } = mergeConfig(this.baseConfig, { - requestConifg: { url } + requestConifg: rawRequestConifg }) const requestResItem = await request(requestConifg) diff --git a/src/types.ts b/src/types.ts index 41573e7..55fb249 100644 --- a/src/types.ts +++ b/src/types.ts @@ -74,6 +74,8 @@ export interface IFetchFileConfig extends IFetchBaseConifg { } } +export interface IFetchHTMLConfig extends IRequestConfig {} + export interface IFileInfo { fileName: string mimeType: string diff --git a/test/start/index.js b/test/start/index.js index dbd8882..a1c8eb1 100644 --- a/test/start/index.js +++ b/test/start/index.js @@ -1 +1 @@ -"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),r=require("https"),s=require("node:url");function i(e){return new Promise((t=>setTimeout(t,e)))}function a(e,t=0){let o=Math.floor(Math.random()*e);return o{const r=c(e.data);e.data=r?e.data:JSON.stringify(e.data);const s=d(e),i=n.request(s,(e=>{const{statusCode:o,headers:n}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:o,headers:n,data:e})}))}));i.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),i.on("error",(e=>{o(e)})),"POST"!==s.method||r||i.write(e.data),i.end()}))}async function h(e,t,o){const n=e.length;let r=0;const s=!c(t),u="number"==typeof t;console.log(`Begin execution, total: ${n} `);for(const c of e){r++;let e="success",l=null,d={};try{d=await f(c)}catch(t){l=t,e=`error: ${t.message}`}if(o(l,{id:r,...d}),s&&r!==n){const o=u?t:a(t.max,t.min);console.log(`The ${r} request is ${e}, sleep for ${o}ms`),await i(o)}else console.log(`The ${r} request is ${e}`),console.log("All requests completed!")}}function m(e,t){const{baseUrl:o,timeout:n,intervalTime:r}=e,{requestConifg:s,intervalTime:i}=t,a=u(s)?s:[s];for(const e of a){const{url:t,timeout:r}=e;c(o)||(e.url=o+t),c(r)&&!c(n)&&(e.timeout=n)}return c(i)&&!c(r)&&(t.intervalTime=r),t}const g=new class{baseConfig;constructor(e={}){this.baseConfig=e}async fetchData(e){const{requestConifg:t,intervalTime:o}=m(this.baseConfig,e),n=u(t)?t:[t],r=[];return await h(n,o,((e,t)=>{if(e)return;const o=t.headers["content-type"]??"",n=t.data,s=o.includes("text")?n.toString():JSON.parse(n.toString());r.push({...t,data:s})})),r}fetchFile(o){return new Promise((n=>{const{requestConifg:r,intervalTime:s,fileConfig:i}=m(this.baseConfig,o),a=u(r)?r:[r],c=a.length;let l=0;const d=[];h(a,s,(function(o,r){if(o)return void(r.id===c&&n(d));const{id:s,statusCode:a,headers:u,data:f}=r,h=u["content-type"]??"",m=h.split("/").pop(),g=(new Date).getTime().toString(),p=e.resolve(i.storeDir,`${g}.${m}`);t.createWriteStream(p,"binary").write(f,(e=>{if(e)return console.log(`File save error at id ${s}: ${e.message}`);const t={fileName:g,mimeType:h,size:f.length,filePath:p};d.push({id:s,statusCode:a,headers:u,data:t}),++l!==c&&s!==c||n(d)}))}))}))}async fetchHTML(e){const{requestConifg:t}=m(this.baseConfig,{requestConifg:{url:e}}),n=await f(t);return new o.JSDOM(n.data)}}({timeout:1e4,intervalTime:{max:3e3,min:2e3}});g.fetchHTML("https://www.bilibili.com/").then((t=>{const o=t.window.document.querySelectorAll(".bili-video-card__cover"),n=[];o.forEach(((e,t)=>{const o=e.lastChild;t%2?n.push("https:"+o.src):n.push(o.src)})),console.log(n);const r=n.map((e=>({url:e})));g.fetchFile({requestConifg:r,fileConfig:{storeDir:e.resolve(__dirname,"./upload")}}).then((e=>{console.log(e)}))})); +"use strict";var e=require("node:path"),t=require("node:fs"),o=require("jsdom"),n=require("node:http"),r=require("https"),s=require("node:url");function i(e){return new Promise((t=>setTimeout(t,e)))}function a(e,t=0){let o=Math.floor(Math.random()*e);return o{const r=c(e.data);e.data=r?e.data:JSON.stringify(e.data);const s=d(e),i=n.request(s,(e=>{const{statusCode:o,headers:n}=e,r=[];e.on("data",(e=>r.push(e))),e.on("end",(()=>{const e=Buffer.concat(r);t({statusCode:o,headers:n,data:e})}))}));i.on("timeout",(()=>{o(new Error(`Timeout ${e.timeout}ms`))})),i.on("error",(e=>{o(e)})),"POST"!==s.method||r||i.write(e.data),i.end()}))}async function h(e,t,o){const n=e.length;let r=0;const s=!c(t),u="number"==typeof t;console.log(`Begin execution, total: ${n} `);for(const c of e){r++;let e="success",l=null,d={};try{d=await f(c)}catch(t){l=t,e=`error: ${t.message}`}if(o(l,{id:r,...d}),s&&r!==n){const o=u?t:a(t.max,t.min);console.log(`The ${r} request is ${e}, sleep for ${o}ms`),await i(o)}else console.log(`The ${r} request is ${e}`),console.log("All requests completed!")}}function m(e,t){const{baseUrl:o,timeout:n,intervalTime:r}=e,{requestConifg:s,intervalTime:i}=t,a=u(s)?s:[s];for(const e of a){const{url:t,timeout:r}=e;c(o)||(e.url=o+t),c(r)&&!c(n)&&(e.timeout=n)}return c(i)&&!c(r)&&(t.intervalTime=r),t}const g=new class{baseConfig;constructor(e={}){this.baseConfig=e}async fetchData(e){const{requestConifg:t,intervalTime:o}=m(this.baseConfig,e),n=u(t)?t:[t],r=[];return await h(n,o,((e,t)=>{if(e)return;const o=t.headers["content-type"]??"",n=t.data,s=o.includes("text")?n.toString():JSON.parse(n.toString());r.push({...t,data:s})})),r}fetchFile(o){return new Promise((n=>{const{requestConifg:r,intervalTime:s,fileConfig:i}=m(this.baseConfig,o),a=u(r)?r:[r],c=a.length,l=[];h(a,s,(function(o,r){if(o)return void(r.id===c&&n(l));const{id:s,statusCode:a,headers:u,data:d}=r,f=u["content-type"]??"",h=f.split("/").pop(),m=(new Date).getTime().toString(),g=e.resolve(i.storeDir,`${m}.${h}`);t.createWriteStream(g,"binary").write(d,(e=>{if(e)return console.log(`File save error at id ${s}: ${e.message}`);const t={fileName:m,mimeType:f,size:d.length,filePath:g};l.push({id:s,statusCode:a,headers:u,data:t}),s===c&&n(l)}))}))}))}async fetchHTML(e){const t="string"==typeof e?{url:e}:e;const{requestConifg:n}=m(this.baseConfig,{requestConifg:t}),r=await f(n);return new o.JSDOM(r.data)}}({timeout:1e4,intervalTime:{max:3e3,min:2e3}});g.fetchHTML({url:"https://www.bilibili.com/"}).then((t=>{const o=t.window.document.querySelectorAll(".bili-video-card__cover"),n=[];o.forEach(((e,t)=>{const o=e.lastChild;t%2?n.push("https:"+o.src):n.push(o.src)})),console.log(n);const r=n.map((e=>({url:e})));g.fetchFile({requestConifg:r,fileConfig:{storeDir:e.resolve(__dirname,"./upload")}}).then((e=>{console.log(e)}))})); diff --git a/test/start/index.ts b/test/start/index.ts index 6c5fc94..2ae5b14 100644 --- a/test/start/index.ts +++ b/test/start/index.ts @@ -22,7 +22,7 @@ const testXCrawl = new XCrawl({ // console.log(res) // }) -testXCrawl.fetchHTML('https://www.bilibili.com/').then((jsdom) => { +testXCrawl.fetchHTML({ url: 'https://www.bilibili.com/' }).then((jsdom) => { const document = jsdom.window.document const imgBoxEl = document.querySelectorAll('.bili-video-card__cover')