diff --git a/.gitignore b/.gitignore
index f0567b90..f010ea32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ package-lock.json
npm-debug.log
coverage
test/e2e/results
+.nyc-output
diff --git a/lib/config/defaults.js b/lib/config/defaults.js
index 1e79fe63..8780e83e 100644
--- a/lib/config/defaults.js
+++ b/lib/config/defaults.js
@@ -48,8 +48,7 @@ const config = {
],
request: {
throwHttpErrors: false,
- encoding: 'binary',
- //cookieJar: true,
+ responseType: 'buffer',
decompress: true,
headers: {
'user-agent': defaultRequestUserAgent
diff --git a/lib/plugins/save-resource-to-fs-plugin.js b/lib/plugins/save-resource-to-fs-plugin.js
index 0e7be17c..dfb5e3ba 100644
--- a/lib/plugins/save-resource-to-fs-plugin.js
+++ b/lib/plugins/save-resource-to-fs-plugin.js
@@ -20,7 +20,8 @@ class SaveResourceToFileSystemPlugin {
registerAction('saveResource', async ({resource}) => {
const filename = path.join(absoluteDirectoryPath, resource.getFilename());
const text = resource.getText();
- await fs.outputFile(filename, text, { encoding: 'binary' });
+ const encoding = typeof text === 'string' ? 'utf-8' : 'binary';
+ await fs.outputFile(filename, text, { encoding });
loadedResources.push(resource);
});
diff --git a/lib/request.js b/lib/request.js
index 4ea4e76b..8d993093 100644
--- a/lib/request.js
+++ b/lib/request.js
@@ -1,18 +1,24 @@
import got from 'got';
import logger from './logger.js';
-import { extend, isPlainObject } from './utils/index.js';
+import types from './config/resource-types.js';
+import { extend, isPlainObject, getTypeByMime } from './utils/index.js';
+
+const TEXT_RESOURCE_TYPES = [types.html, types.css];
function getMimeType (contentType) {
return contentType ? contentType.split(';')[0] : null;
}
-function defaultResponseHandler ({response}) {
- return Promise.resolve(response.body);
+function defaultResponseHandler ({response, type}) {
+ if (TEXT_RESOURCE_TYPES.includes(type)) {
+ return response.body.toString();
+ }
+ return response.body;
}
function transformResult (result) {
switch (true) {
- case typeof result === 'string':
+ case typeof result === 'string' || Buffer.isBuffer(result):
return {
body: result,
metadata: null
@@ -41,14 +47,19 @@ async function getRequest ({url, referer, options = {}, afterResponse = defaultR
const response = await got(requestOptions);
logger.debug(`[request] received response for ${response.url}, statusCode ${response.statusCode}`);
- const responseHandlerResult = transformResult(await afterResponse({response}));
+
+ const mimeType = getMimeType(response.headers['content-type']);
+ const resourceType = getTypeByMime(mimeType);
+
+ const responseHandlerResult = transformResult(await afterResponse({ response, type: resourceType }));
if (!responseHandlerResult) {
return null;
}
return {
url: response.url,
- mimeType: getMimeType(response.headers['content-type']),
+ type: resourceType,
+ mimeType,
body: responseHandlerResult.body,
metadata: responseHandlerResult.metadata
};
diff --git a/lib/scraper.js b/lib/scraper.js
index 040a9cd9..339aa301 100644
--- a/lib/scraper.js
+++ b/lib/scraper.js
@@ -13,7 +13,7 @@ import {
} from './plugins/index.js';
import * as utils from './utils/index.js';
-const { extend, union, urlsEqual, getTypeByMime, getTypeByFilename, series } = utils;
+const { extend, union, urlsEqual, getTypeByFilename, series } = utils;
import NormalizedUrlMap from './utils/normalized-url-map.js';
const actionNames = [
@@ -170,7 +170,7 @@ class Scraper {
self.requestedResourcePromises.set(responseData.url, requestPromise);
}
- resource.setType(getTypeByMime(responseData.mimeType));
+ resource.setType(responseData.type);
const { filename } = await self.runActions('generateFilename', { resource, responseData });
resource.setFilename(filename);
diff --git a/package.json b/package.json
index d26cc597..c05271a0 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "website-scraper",
- "version": "5.0.0",
+ "version": "5.1.0",
"description": "Download website to a local directory (including all css, images, js, etc.)",
"readmeFilename": "README.md",
"type": "module",
@@ -38,7 +38,7 @@
},
"homepage": "https://github.com/website-scraper/node-website-scraper",
"dependencies": {
- "cheerio": "1.0.0-rc.10",
+ "cheerio": "1.0.0-rc.11",
"css-url-parser": "^1.0.0",
"debug": "^4.3.1",
"fs-extra": "^10.0.0",
@@ -52,10 +52,10 @@
"devDependencies": {
"c8": "^7.7.2",
"eslint": "^8.5.0",
- "mocha": "^9.1.3",
+ "mocha": "^10.0.0",
"nock": "^13.0.11",
"should": "^13.2.3",
- "sinon": "^12.0.1"
+ "sinon": "^14.0.0"
},
"files": [
"index.mjs",
diff --git a/test/functional/base/base.test.js b/test/functional/base/base.test.js
index 383bbda8..8ac3dc3f 100644
--- a/test/functional/base/base.test.js
+++ b/test/functional/base/base.test.js
@@ -51,15 +51,15 @@ describe('Functional: base', function() {
nock('http://blog.example.com/').get('/').replyWithFile(200, mockDirname + '/blog.html', {'content-type': 'text/html'});
// mock sources for index.html
- nock('http://example.com/').get('/index.css').replyWithFile(200, mockDirname + '/index.css');
+ nock('http://example.com/').get('/index.css').replyWithFile(200, mockDirname + '/index.css', {'content-type': 'text/css'});
nock('http://example.com/').get('/background.png').reply(200, 'OK');
nock('http://example.com/').get('/cat.jpg').reply(200, 'OK');
nock('http://example.com/').get('/script.min.js').reply(200, 'OK');
// mock sources for index.css
- nock('http://example.com/').get('/files/index-import-1.css').reply(200, 'OK');
- nock('http://example.com/').get('/files/index-import-2.css').replyWithFile(200, mockDirname + '/index-import-2.css');
- nock('http://example.com/').get('/files/index-import-3.css').reply(200, 'OK');
+ nock('http://example.com/').get('/files/index-import-1.css').reply(200, 'OK', {'content-type': 'text/css'});
+ nock('http://example.com/').get('/files/index-import-2.css').replyWithFile(200, mockDirname + '/index-import-2.css', {'content-type': 'text/css'});
+ nock('http://example.com/').get('/files/index-import-3.css').reply(200, 'OK', {'content-type': 'text/css'});
nock('http://example.com/').get('/files/index-image-1.png').reply(200, 'OK');
nock('http://example.com/').get('/files/index-image-2.png').reply(200, 'OK');
diff --git a/test/functional/binary-resources/images.test.js b/test/functional/binary-resources/images.test.js
new file mode 100644
index 00000000..05d07916
--- /dev/null
+++ b/test/functional/binary-resources/images.test.js
@@ -0,0 +1,69 @@
+import should from 'should';
+import '../../utils/assertions.js';
+import nock from 'nock';
+import fs from 'fs-extra';
+import cheerio from 'cheerio';
+import scrape from 'website-scraper';
+
+const testDirname = './test/functional/binary-resources/.tmp';
+const mockDirname = './test/functional/binary-resources/mocks';
+
+describe('Functional: images', () => {
+ const options = {
+ urls: [ 'http://example.com/' ],
+ directory: testDirname,
+ subdirectories: [
+ { directory: 'img', extensions: ['.jpg', '.png'] }
+ ],
+ sources: [
+ { selector: 'img', attr: 'src' }
+ ],
+ ignoreErrors: false
+ };
+
+ beforeEach(() => {
+ nock.cleanAll();
+ nock.disableNetConnect();
+ });
+
+ afterEach(() => {
+ nock.cleanAll();
+ nock.enableNetConnect();
+ fs.removeSync(testDirname);
+ });
+
+ beforeEach(() => {
+ // mock base urls
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
+
+ // mock sources for index.html
+ nock('http://example.com/').get('/test-image.png').replyWithFile(200, mockDirname + '/test-image.png', {'content-type': 'image/png'});
+ nock('http://example.com/').get('/test-image.jpg').replyWithFile(200, mockDirname + '/test-image.jpg', {'content-type': 'image/jpeg'});
+ });
+
+ it('should load images and save content correctly', async () => {
+ await scrape(options);
+
+ // should create directory and subdirectories
+ fs.existsSync(testDirname).should.be.eql(true);
+ fs.existsSync(testDirname + '/img').should.be.eql(true);
+
+ // should contain all sources found in index.html
+ fs.existsSync(testDirname + '/img/test-image.png').should.be.eql(true);
+ fs.existsSync(testDirname + '/img/test-image.jpg').should.be.eql(true);
+
+ // all sources in index.html should be replaced with local paths
+ let $ = cheerio.load(fs.readFileSync(testDirname + '/index.html').toString());
+ $('img.png').attr('src').should.be.eql('img/test-image.png');
+ $('img.jpg').attr('src').should.be.eql('img/test-image.jpg');
+
+ // content of downloaded images should equal original images
+ const originalPng = fs.readFileSync(mockDirname + '/test-image.png');
+ const originalJpg = fs.readFileSync(mockDirname + '/test-image.jpg');
+ const resultPng = fs.readFileSync(testDirname + '/img/test-image.png');
+ const resultJpg = fs.readFileSync(testDirname + '/img/test-image.jpg');
+
+ should(resultPng).be.eql(originalPng);
+ should(resultJpg).be.eql(originalJpg);
+ });
+});
diff --git a/test/functional/binary-resources/mocks/index.html b/test/functional/binary-resources/mocks/index.html
new file mode 100644
index 00000000..20f1cf2a
--- /dev/null
+++ b/test/functional/binary-resources/mocks/index.html
@@ -0,0 +1,11 @@
+
+
+
+
+ Index
+
+
+
+
+
+
\ No newline at end of file
diff --git a/test/functional/binary-resources/mocks/test-image.jpg b/test/functional/binary-resources/mocks/test-image.jpg
new file mode 100644
index 00000000..2fc54b46
Binary files /dev/null and b/test/functional/binary-resources/mocks/test-image.jpg differ
diff --git a/test/functional/binary-resources/mocks/test-image.png b/test/functional/binary-resources/mocks/test-image.png
new file mode 100644
index 00000000..9c9ee685
Binary files /dev/null and b/test/functional/binary-resources/mocks/test-image.png differ
diff --git a/test/functional/callbacks/callbacks.test.js b/test/functional/callbacks/callbacks.test.js
index 1e07e916..7f6b3279 100644
--- a/test/functional/callbacks/callbacks.test.js
+++ b/test/functional/callbacks/callbacks.test.js
@@ -6,7 +6,6 @@ import sinon from 'sinon';
import scrape from 'website-scraper';
const testDirname = './test/functional/callbacks/.tmp';
-const mockDirname = './test/functional/base/mocks';
describe('Functional: onResourceSaved and onResourceError callbacks in plugin', () => {
diff --git a/test/functional/circular-dependencies/circular-dependencies.test.js b/test/functional/circular-dependencies/circular-dependencies.test.js
index 428669e1..d5a5c229 100644
--- a/test/functional/circular-dependencies/circular-dependencies.test.js
+++ b/test/functional/circular-dependencies/circular-dependencies.test.js
@@ -34,10 +34,10 @@ describe('Functional circular dependencies', function() {
]
};
- nock('http://example.com/').get('/index.html').replyWithFile(200, mockDirname + '/index.html');
- nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html');
- nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css');
- nock('http://example.com/').get('/style2.css').replyWithFile(200, mockDirname + '/style2.css');
+ nock('http://example.com/').get('/index.html').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
+ nock('http://example.com/').get('/about.html').replyWithFile(200, mockDirname + '/about.html', {'content-type': 'text/html'});
+ nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'});
+ nock('http://example.com/').get('/style2.css').replyWithFile(200, mockDirname + '/style2.css', {'content-type': 'text/css'});
return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
diff --git a/test/functional/css-handling/css-handling.test.js b/test/functional/css-handling/css-handling.test.js
index afb3cd64..211b2b1f 100644
--- a/test/functional/css-handling/css-handling.test.js
+++ b/test/functional/css-handling/css-handling.test.js
@@ -21,11 +21,11 @@ describe('Functional: css handling', function() {
});
it('should correctly handle css files, style tags and style attributes and ignore css-like text inside common html tags', function() {
- nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
- nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css');
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
+ nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'});
- nock('http://example.com/').get('/style-import-1.css').reply(200, 'style-import-1.css');
- nock('http://example.com/').get('/style-import-2.css').reply(200, 'style-import-2.css');
+ nock('http://example.com/').get('/style-import-1.css').reply(200, 'style-import-1.css', {'content-type': 'text/css'});
+ nock('http://example.com/').get('/style-import-2.css').reply(200, 'style-import-2.css', {'content-type': 'text/css'});
nock('http://example.com/').get('/style-tag.png').reply(200, 'style-tag.png');
nock('http://example.com/').get('/style-attr.png').reply(200, 'style-attr.png');
nock('http://example.com/').get('/css-like-text-in-html.png').reply(200, 'css-like-text-in-html.png');
diff --git a/test/functional/encoding/hieroglyphs.test.js b/test/functional/encoding/hieroglyphs.test.js
new file mode 100644
index 00000000..3081df2b
--- /dev/null
+++ b/test/functional/encoding/hieroglyphs.test.js
@@ -0,0 +1,41 @@
+import '../../utils/assertions.js';
+import nock from 'nock';
+import fs from 'fs-extra';
+import scrape from 'website-scraper';
+
+const testDirname = './test/functional/encoding/.tmp';
+const mockDirname = './test/functional/encoding/mocks';
+
+describe('Functional: Korean characters are properly encoded/decoded', function() {
+ const options = {
+ urls: [
+ 'http://example.com/',
+ ],
+ directory: testDirname,
+ ignoreErrors: false
+ };
+
+ beforeEach(function() {
+ nock.cleanAll();
+ nock.disableNetConnect();
+ });
+
+ afterEach(function() {
+ nock.cleanAll();
+ nock.enableNetConnect();
+ fs.removeSync(testDirname);
+ });
+
+ beforeEach(() => {
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
+ });
+
+ it('should save the page in the same data as it was originally', () => {
+ return scrape(options).then(function(result) {
+ const scrapedIndex = fs.readFileSync(testDirname + '/index.html').toString();
+			scrapedIndex.should.be.containEql('저는 7년 동안 한국에서 살았어요.');
+			scrapedIndex.should.be.containEql('Слава Україні!');
+			scrapedIndex.should.be.containEql('加入网站');
+ });
+ });
+});
diff --git a/test/functional/encoding/mocks/index.html b/test/functional/encoding/mocks/index.html
new file mode 100644
index 00000000..8d724cc2
--- /dev/null
+++ b/test/functional/encoding/mocks/index.html
@@ -0,0 +1,12 @@
+
+
+
+
+ Test
+
+
+ 저는 7년 동안 한국에서 살았어요.
+ Слава Україні!
+ 加入网站
+
+
diff --git a/test/functional/html-entities/html-entities.test.js b/test/functional/html-entities/html-entities.test.js
index 4087628b..cb682dd5 100644
--- a/test/functional/html-entities/html-entities.test.js
+++ b/test/functional/html-entities/html-entities.test.js
@@ -21,8 +21,8 @@ describe('Functional: html entities', function() {
});
it('should decode all html-entities found in html files and not encode entities from css file', function() {
- nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
- nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css');
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
+ nock('http://example.com/').get('/style.css').replyWithFile(200, mockDirname + '/style.css', {'content-type': 'text/css'});
// in index.html
// /fonts?family=Myriad&v=2 => /fonts?family=Myriad&v=2
diff --git a/test/functional/redirect/redirect.test.js b/test/functional/redirect/redirect.test.js
index 4a25dc63..a7e4f16f 100644
--- a/test/functional/redirect/redirect.test.js
+++ b/test/functional/redirect/redirect.test.js
@@ -23,16 +23,16 @@ describe('Functional redirects', function() {
});
it('should follow redirects and save resource once if it has different urls', function() {
- nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html');
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/index.html', {'content-type': 'text/html'});
// true page - ok
-		nock('http://example.com/').get('/true-page.html').reply(200, 'true page 1');
+		nock('http://example.com/').get('/true-page.html').reply(200, 'true page 1', {'content-type': 'text/html'});
// duplicating page - redirect to true page
nock('http://example.com/').get('/duplicating-page.html').reply(302, '', {'Location': 'http://example.com/true-page.html'});
nock('http://example.com/').get('/true-page.html').reply(200, 'true page 2');
// duplicating site - redirect to duplicating page, then redirect to true page
nock('http://duplicating.another-site.com/').get('/').reply(302, '', {'Location': 'http://example.com/duplicating-page.html'});
nock('http://example.com/').get('/duplicating-page.html').reply(302, '', {'Location': 'http://example.com/true-page.html'});
- nock('http://example.com/').get('/true-page.html').reply(200, 'true page 3');
+ nock('http://example.com/').get('/true-page.html').reply(200, 'true page 3', {'content-type': 'text/html'});
const options = {
urls: [ 'http://example.com/' ],
@@ -79,11 +79,11 @@ describe('Functional redirects', function() {
]
};
- nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/relative-resources-index.html');
+ nock('http://example.com/').get('/').replyWithFile(200, mockDirname + '/relative-resources-index.html', {'content-type': 'text/html'});
nock('http://example.com/').get('/about').reply(301, '', {'Location': 'http://example.com/about/'});
nock('http://example.com/').get('/about/').replyWithFile(200, mockDirname + '/relative-resources-about.html', {'content-type': 'text/html'});
- nock('http://example.com/').get('/style.css').reply(200, 'style.css');
- nock('http://example.com/').get('/about/style.css').reply(200, 'about/style.css');
+ nock('http://example.com/').get('/style.css').reply(200, 'style.css', {'content-type': 'text/css'});
+ nock('http://example.com/').get('/about/style.css').reply(200, 'about/style.css', {'content-type': 'text/css'});
return scrape(options).then(function() {
fs.existsSync(testDirname + '/index.html').should.be.eql(true);
diff --git a/test/unit/scraper-init-test.js b/test/unit/scraper-init-test.js
index 9612e180..080c5976 100644
--- a/test/unit/scraper-init-test.js
+++ b/test/unit/scraper-init-test.js
@@ -121,7 +121,7 @@ describe('Scraper initialization', function () {
s.options.request.should.containEql({
throwHttpErrors: false,
- encoding: 'binary',
+ responseType: 'buffer',
decompress: true,
https: {
rejectUnauthorized: false
@@ -143,7 +143,7 @@ describe('Scraper initialization', function () {
s.options.request.should.eql({
throwHttpErrors: true,
- encoding: 'binary',
+ responseType: 'buffer',
decompress: true,
https: {
rejectUnauthorized: false
diff --git a/test/unit/scraper-test.js b/test/unit/scraper-test.js
index 87e566e7..87ee177c 100644
--- a/test/unit/scraper-test.js
+++ b/test/unit/scraper-test.js
@@ -103,7 +103,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com/a.png');
rr.getFilename().should.be.not.empty();
- rr.getText().should.be.eql('OK');
+ rr.getText().should.be.not.empty();
});
it('should return null if the urlFilter returns false', async () =>{
@@ -138,7 +138,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com');
rr.getFilename().should.be.not.empty();
- rr.getText().should.be.eql('OK');
+ rr.getText().should.be.not.empty();
});
});
@@ -160,7 +160,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com/a.png');
rr.getFilename().should.be.not.empty();
- rr.getText().should.be.eql('OK');
+ rr.getText().should.be.not.empty();
});
it('should request the resource if maxDepth is set and resource depth is less than maxDept', async () =>{
@@ -181,7 +181,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com/a.png');
rr.getFilename().should.be.not.empty();
- rr.getText().should.be.eql('OK');
+ rr.getText().should.be.not.empty();
});
it('should request the resource if maxDepth is set and resource depth is equal to maxDept', async () =>{
@@ -201,7 +201,7 @@ describe('Scraper', () => {
rr.should.be.eql(r);
rr.getUrl().should.be.eql('http://example.com/a.png');
rr.getFilename().should.be.not.empty();
- rr.getText().should.be.eql('OK');
+ rr.getText().should.be.not.empty();
});
it('should return null if maxDepth is set and resource depth is greater than maxDepth', async () =>{