diff --git a/.gitignore b/.gitignore index f0567b90..f010ea32 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ package-lock.json npm-debug.log coverage test/e2e/results +.nyc-output diff --git a/lib/config/defaults.js b/lib/config/defaults.js index 1e79fe63..8780e83e 100644 --- a/lib/config/defaults.js +++ b/lib/config/defaults.js @@ -48,8 +48,7 @@ const config = { ], request: { throwHttpErrors: false, - encoding: 'binary', - //cookieJar: true, + responseType: 'buffer', decompress: true, headers: { 'user-agent': defaultRequestUserAgent diff --git a/lib/plugins/save-resource-to-fs-plugin.js b/lib/plugins/save-resource-to-fs-plugin.js index 0e7be17c..dfb5e3ba 100644 --- a/lib/plugins/save-resource-to-fs-plugin.js +++ b/lib/plugins/save-resource-to-fs-plugin.js @@ -20,7 +20,8 @@ class SaveResourceToFileSystemPlugin { registerAction('saveResource', async ({resource}) => { const filename = path.join(absoluteDirectoryPath, resource.getFilename()); const text = resource.getText(); - await fs.outputFile(filename, text, { encoding: 'binary' }); + const encoding = typeof text === 'string' ? 'utf-8' : 'binary'; + await fs.outputFile(filename, text, { encoding }); loadedResources.push(resource); }); diff --git a/lib/request.js b/lib/request.js index 4ea4e76b..8d993093 100644 --- a/lib/request.js +++ b/lib/request.js @@ -1,18 +1,24 @@ import got from 'got'; import logger from './logger.js'; -import { extend, isPlainObject } from './utils/index.js'; +import types from './config/resource-types.js'; +import { extend, isPlainObject, getTypeByMime } from './utils/index.js'; + +const TEXT_RESOURCE_TYPES = [types.html, types.css]; function getMimeType (contentType) { return contentType ? contentType.split(';')[0] : null; } -function defaultResponseHandler ({response}) { - return Promise.resolve(response.body); +function defaultResponseHandler ({response, type}) { + if (TEXT_RESOURCE_TYPES.includes(type)) { + return response.body.toString(); + } + return response.body; } function transformResult (result) { switch (true) { - case typeof result === 'string': + case typeof result === 'string' || Buffer.isBuffer(result): return { body: result, metadata: null @@ -41,14 +47,19 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR const response = await got(requestOptions); logger.debug(`[request] received response for ${response.url}, statusCode ${response.statusCode}`); - const responseHandlerResult = transformResult(await afterResponse({response})); + + const mimeType = getMimeType(response.headers['content-type']); + const resourceType = getTypeByMime(mimeType); + + const responseHandlerResult = transformResult(await afterResponse({ response, type: resourceType })); if (!responseHandlerResult) { return null; } return { url: response.url, - mimeType: getMimeType(response.headers['content-type']), + type: resourceType, + mimeType, body: responseHandlerResult.body, metadata: responseHandlerResult.metadata }; diff --git a/lib/scraper.js b/lib/scraper.js index 040a9cd9..339aa301 100644 --- a/lib/scraper.js +++ b/lib/scraper.js @@ -13,7 +13,7 @@ import { } from './plugins/index.js'; import * as utils from './utils/index.js'; -const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils; +const { extend, union, urlsEqual, getTypeByFilename, series } = utils; import NormalizedUrlMap from './utils/normalized-url-map.js'; const actionNames = [ @@ -170,7 +170,7 @@ class Scraper { self.requestedResourcePromises.set(responseData.url, requestPromise); } - resource.setType(getTypeByMime(responseData.mimeType)); + resource.setType(responseData.type); const { filename } = await self.runActions('generateFilename', { resource, responseData }); resource.setFilename(filename); diff --git a/package.json b/package.json index d26cc597..c05271a0 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "website-scraper", - "version": "5.0.0", + "version": "5.1.0", "description": "Download website to a local directory (including all css, images, js, etc.)", "readmeFilename": "README.md", "type": "module", @@ -38,7 +38,7 @@ }, "homepage": "https://github.com/website-scraper/node-website-scraper", "dependencies": { - "cheerio": "1.0.0-rc.10", + "cheerio": "1.0.0-rc.11", "css-url-parser": "^1.0.0", "debug": "^4.3.1", "fs-extra": "^10.0.0", @@ -52,10 +52,10 @@ "devDependencies": { "c8": "^7.7.2", "eslint": "^8.5.0", - "mocha": "^9.1.3", + "mocha": "^10.0.0", "nock": "^13.0.11", "should": "^13.2.3", - "sinon": "^12.0.1" + "sinon": "^14.0.0" }, "files": [ "index.mjs", diff --git a/test/functional/base/base.test.js b/test/functional/base/base.test.js index 383bbda8..8ac3dc3f 100644 --- a/test/functional/base/base.test.js +++ b/test/functional/base/base.test.js @@ -51,15 +51,15 @@ describe('Functional: base', function() { nock('http://blog.example.com/').get('/').replyWithFile(200, mockDirname + '/blog.html', {'content-type': 'text/html'}); // mock sources for index.html - nock('http://example.com/').get('/index.css').replyWithFile(200, mockDirname + '/index.css'); + nock('http://example.com/').get('/index.css').replyWithFile(200, mockDirname + '/index.css', {'content-type': 'text/css'}); nock('http://example.com/').get('/background.png').reply(200, 'OK'); nock('http://example.com/').get('/cat.jpg').reply(200, 'OK'); nock('http://example.com/').get('/script.min.js').reply(200, 'OK'); // mock sources for index.css - nock('http://example.com/').get('/files/index-import-1.css').reply(200, 'OK'); - nock('http://example.com/').get('/files/index-import-2.css').replyWithFile(200, mockDirname + '/index-import-2.css'); - nock('http://example.com/').get('/files/index-import-3.css').reply(200, 'OK'); + nock('http://example.com/').get('/files/index-import-1.css').reply(200, 'OK', {'content-type': 'text/css'}); + nock('http://example.com/').get('/files/index-import-2.css').replyWithFile(200, mockDirname + '/index-import-2.css', {'content-type': 'text/css'}); + nock('http://example.com/').get('/files/index-import-3.css').reply(200, 'OK', {'content-type': 'text/css'}); nock('http://example.com/').get('/files/index-image-1.png').reply(200, 'OK'); nock('http://example.com/').get('/files/index-image-2.png').reply(200, 'OK'); diff --git a/test/functional/binary-resources/images.test.js b/test/functional/binary-resources/images.test.js new file mode 100644 index 00000000..05d07916 --- /dev/null +++ b/test/functional/binary-resources/images.test.js @@ -0,0 +1,69 @@ +import should from 'should'; +import '../../utils/assertions.js'; +import nock from 'nock'; +import fs from 'fs-extra'; +import cheerio from 'cheerio'; +import scrape from 'website-scraper'; + +const testDirname = './test/functional/binary-resources/.tmp'; +const mockDirname = './test/functional/binary-resources/mocks'; + +describe('Functional: images', () => { + const options = { + urls: [ 'http://example.com/' ], + directory: testDirname, + subdirectories: [ + { directory: 'img', extensions: ['.jpg', '.png'] } + ], + sources: [ + { selector: 'img', attr: 'src' } + ], + ignoreErrors: false + }; + + beforeEach(() => { + nock.cleanAll(); + nock.disableNetConnect(); + }); + + afterEach(() => { + nock.cleanAll(); + nock.enableNetConnect(); + fs.removeSync(testDirname); + }); + + beforeEach(() => { + // mock base urls + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); + + // mock sources for index.html + nock('http://example.com/').get('/test-image.png').replyWithFile(200, mockDirname + '/test-image.png', {'content-type': 'image/png'}); + nock('http://example.com/').get('/test-image.jpg').replyWithFile(200, mockDirname + '/test-image.jpg', {'content-type': 'image/jpeg'}); + }); + + it('should load images and save content correctly', async () => { + await scrape(options); + + // should create directory and subdirectories + fs.existsSync(testDirname).should.be.eql(true); + fs.existsSync(testDirname + '/img').should.be.eql(true); + + // should contain all sources found in index.html + fs.existsSync(testDirname + '/img/test-image.png').should.be.eql(true); + fs.existsSync(testDirname + '/img/test-image.jpg').should.be.eql(true); + + // all sources in index.html should be replaced with local paths + let $ = cheerio.load(fs.readFileSync(testDirname + '/index.html').toString()); + $('img.png').attr('src').should.be.eql('img/test-image.png'); + $('img.jpg').attr('src').should.be.eql('img/test-image.jpg'); + + // content of downloaded images should equal original images + const originalPng = fs.readFileSync(mockDirname + '/test-image.png'); + const originalJpg = fs.readFileSync(mockDirname + '/test-image.jpg'); + const resultPng = fs.readFileSync(testDirname + '/img/test-image.png'); + const resultJpg = fs.readFileSync(testDirname + '/img/test-image.jpg'); + + should(resultPng).be.eql(originalPng); + should(resultJpg).be.eql(originalJpg); + }); +}); diff --git a/test/functional/binary-resources/mocks/index.html b/test/functional/binary-resources/mocks/index.html new file mode 100644 index 00000000..20f1cf2a --- /dev/null +++ b/test/functional/binary-resources/mocks/index.html @@ -0,0 +1,11 @@ + + + + + Index + + + + + + \ No newline at end of file diff --git a/test/functional/binary-resources/mocks/test-image.jpg b/test/functional/binary-resources/mocks/test-image.jpg new file mode 100644 index 00000000..2fc54b46 Binary files /dev/null and b/test/functional/binary-resources/mocks/test-image.jpg differ diff --git a/test/functional/binary-resources/mocks/test-image.png b/test/functional/binary-resources/mocks/test-image.png new file mode 100644 index 00000000..9c9ee685 Binary files /dev/null and b/test/functional/binary-resources/mocks/test-image.png differ diff --git a/test/functional/callbacks/callbacks.test.js b/test/functional/callbacks/callbacks.test.js index 1e07e916..7f6b3279 100644 --- a/test/functional/callbacks/callbacks.test.js +++ b/test/functional/callbacks/callbacks.test.js @@ -6,7 +6,6 @@ import sinon from 'sinon'; import scrape from 'website-scraper'; const testDirname = './test/functional/callbacks/.tmp'; -const mockDirname = './test/functional/base/mocks'; describe('Functional: onResourceSaved and onResourceError callbacks in plugin', () => { diff --git a/test/functional/circular-dependencies/circular-dependencies.test.js b/test/functional/circular-dependencies/circular-dependencies.test.js index 428669e1..d5a5c229 100644 --- a/test/functional/circular-dependencies/circular-dependencies.test.js +++ b/test/functional/circular-dependencies/circular-dependencies.test.js @@ -34,10 +34,10 @@ describe('Functional circular dependencies', function() { ] }; - nock('http://example.com/').get('/index.html').replyWithFile(200, mockDirname + '/index.html'); - nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html'); - nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css'); - nock('http://example.com/').get('/style2.css').replyWithFile(200, mockDirname + '/style2.css'); + nock('http://example.com/').get('/index.html').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); + nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html', {'content-type': 'text/html'}); + nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'}); + nock('http://example.com/').get('/style2.css').replyWithFile(200, mockDirname + '/style2.css', {'content-type': 'text/css'}); return scrape(options).then(function() { fs.existsSync(testDirname + '/index.html').should.be.eql(true); diff --git a/test/functional/css-handling/css-handling.test.js b/test/functional/css-handling/css-handling.test.js index afb3cd64..211b2b1f 100644 --- a/test/functional/css-handling/css-handling.test.js +++ b/test/functional/css-handling/css-handling.test.js @@ -21,11 +21,11 @@ describe('Functional: css handling', function() { }); it('should correctly handle css files, style tags and style attributes and ignore css-like text inside common html tags', function() { - nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); - nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css'); + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); + nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'}); - nock('http://example.com/').get('/style-import-1.css').reply(200, 'style-import-1.css'); - nock('http://example.com/').get('/style-import-2.css').reply(200, 'style-import-2.css'); + nock('http://example.com/').get('/style-import-1.css').reply(200, 'style-import-1.css', {'content-type': 'text/css'}); + nock('http://example.com/').get('/style-import-2.css').reply(200, 'style-import-2.css', {'content-type': 'text/css'}); nock('http://example.com/').get('/style-tag.png').reply(200, 'style-tag.png'); nock('http://example.com/').get('/style-attr.png').reply(200, 'style-attr.png'); nock('http://example.com/').get('/css-like-text-in-html.png').reply(200, 'css-like-text-in-html.png'); diff --git a/test/functional/encoding/hieroglyphs.test.js b/test/functional/encoding/hieroglyphs.test.js new file mode 100644 index 00000000..3081df2b --- /dev/null +++ b/test/functional/encoding/hieroglyphs.test.js @@ -0,0 +1,41 @@ +import '../../utils/assertions.js'; +import nock from 'nock'; +import fs from 'fs-extra'; +import scrape from 'website-scraper'; + +const testDirname = './test/functional/encoding/.tmp'; +const mockDirname = './test/functional/encoding/mocks'; + +describe('Functional: Korean characters are properly encoded/decoded', function() { + const options = { + urls: [ + 'http://example.com/', + ], + directory: testDirname, + ignoreErrors: false + }; + + beforeEach(function() { + nock.cleanAll(); + nock.disableNetConnect(); + }); + + afterEach(function() { + nock.cleanAll(); + nock.enableNetConnect(); + fs.removeSync(testDirname); + }); + + beforeEach(() => { + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); + }); + + it('should save the page in the same data as it was originally', () => { + return scrape(options).then(function(result) { + const scrapedIndex = fs.readFileSync(testDirname + '/index.html').toString(); + scrapedIndex.should.be.containEql('
저는 7년 동안 한국에서 살았어요.
'); + scrapedIndex.should.be.containEql('
Слава Україні!
'); + scrapedIndex.should.be.containEql('
加入网站
'); + }); + }); +}); diff --git a/test/functional/encoding/mocks/index.html b/test/functional/encoding/mocks/index.html new file mode 100644 index 00000000..8d724cc2 --- /dev/null +++ b/test/functional/encoding/mocks/index.html @@ -0,0 +1,12 @@ + + + + + Test + + +
저는 7년 동안 한국에서 살았어요.
+
Слава Україні!
+
加入网站
+ + diff --git a/test/functional/html-entities/html-entities.test.js b/test/functional/html-entities/html-entities.test.js index 4087628b..cb682dd5 100644 --- a/test/functional/html-entities/html-entities.test.js +++ b/test/functional/html-entities/html-entities.test.js @@ -21,8 +21,8 @@ describe('Functional: html entities', function() { }); it('should decode all html-entities found in html files and not encode entities from css file', function() { - nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); - nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css'); + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); + nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'}); // in index.html // /fonts?family=Myriad&v=2 => /fonts?family=Myriad&v=2 diff --git a/test/functional/redirect/redirect.test.js b/test/functional/redirect/redirect.test.js index 4a25dc63..a7e4f16f 100644 --- a/test/functional/redirect/redirect.test.js +++ b/test/functional/redirect/redirect.test.js @@ -23,16 +23,16 @@ describe('Functional redirects', function() { }); it('should follow redirects and save resource once if it has different urls', function() { - nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html'); + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'}); // true page - ok - nock('http://example.com/').get('/true-page.html').reply(200, ' true page 1'); + nock('http://example.com/').get('/true-page.html').reply(200, ' true page 1', {'content-type': 'text/html'}); // duplicating page - redirect to true page nock('http://example.com/').get('/duplicating-page.html').reply(302, '', {'Location': 'http://example.com/true-page.html'}); nock('http://example.com/').get('/true-page.html').reply(200, 'true page 2'); // duplicating site - redirect to duplicating page, then redirect to true page nock('http://duplicating.another-site.com/').get('/').reply(302, '', {'Location': 'http://example.com/duplicating-page.html'}); nock('http://example.com/').get('/duplicating-page.html').reply(302, '', {'Location': 'http://example.com/true-page.html'}); - nock('http://example.com/').get('/true-page.html').reply(200, 'true page 3'); + nock('http://example.com/').get('/true-page.html').reply(200, 'true page 3', {'content-type': 'text/html'}); const options = { urls: [ 'http://example.com/' ], @@ -79,11 +79,11 @@ describe('Functional redirects', function() { ] }; - nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/relative-resources-index.html'); + nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/relative-resources-index.html', {'content-type': 'text/html'}); nock('http://example.com/').get('/about').reply(301, '', {'Location': 'http://example.com/about/'}); nock('http://example.com/').get('/about/').replyWithFile(200, mockDirname + '/relative-resources-about.html', {'content-type': 'text/html'}); - nock('http://example.com/').get('/style.css').reply(200, 'style.css'); - nock('http://example.com/').get('/about/style.css').reply(200, 'about/style.css'); + nock('http://example.com/').get('/style.css').reply(200, 'style.css', {'content-type': 'text/css'}); + nock('http://example.com/').get('/about/style.css').reply(200, 'about/style.css', {'content-type': 'text/css'}); return scrape(options).then(function() { fs.existsSync(testDirname + '/index.html').should.be.eql(true); diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js index 9612e180..080c5976 100644 --- a/test/unit/scraper-init-test.js +++ b/test/unit/scraper-init-test.js @@ -121,7 +121,7 @@ describe('Scraper initialization', function () { s.options.request.should.containEql({ throwHttpErrors: false, - encoding: 'binary', + responseType: 'buffer', decompress: true, https: { rejectUnauthorized: false @@ -143,7 +143,7 @@ describe('Scraper initialization', function () { s.options.request.should.eql({ throwHttpErrors: true, - encoding: 'binary', + responseType: 'buffer', decompress: true, https: { rejectUnauthorized: false diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js index 87e566e7..87ee177c 100644 --- a/test/unit/scraper-test.js +++ b/test/unit/scraper-test.js @@ -103,7 +103,7 @@ describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com/a.png'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.eql('OK'); + rr.getText().should.be.not.empty(); }); it('should return null if the urlFilter returns false', async () =>{ @@ -138,7 +138,7 @@ describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.eql('OK'); + rr.getText().should.be.not.empty(); }); }); @@ -160,7 +160,7 @@ describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com/a.png'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.eql('OK'); + rr.getText().should.be.not.empty(); }); it('should request the resource if maxDepth is set and resource depth is less than maxDept', async () =>{ @@ -181,7 +181,7 @@ describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com/a.png'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.eql('OK'); + rr.getText().should.be.not.empty(); }); it('should request the resource if maxDepth is set and resource depth is equal to maxDept', async () =>{ @@ -201,7 +201,7 @@ describe('Scraper', () => { rr.should.be.eql(r); rr.getUrl().should.be.eql('http://example.com/a.png'); rr.getFilename().should.be.not.empty(); - rr.getText().should.be.eql('OK'); + rr.getText().should.be.not.empty(); }); it('should return null if maxDepth is set and resource depth is greater than maxDepth', async () =>{