diff --git a/src/Downloader.ts b/src/Downloader.ts index 80dc95d3..73e5cf6a 100644 --- a/src/Downloader.ts +++ b/src/Downloader.ts @@ -166,19 +166,39 @@ class Downloader { } } - public async setBaseUrls() { - //* Objects order in array matters! - this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([ - { condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href }, - { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href }, - ]) - - //* Objects order in array matters! - this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([ - { condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href }, - { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href }, - ]) - + public async setBaseUrls(forceRender = null) { + if (!forceRender) { + //* Objects order in array matters! + this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([ + { condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href }, + { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href }, + ]) + + //* Objects order in array matters! 
+ this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([ + { condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href }, + { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href }, + ]) + } else { + switch (forceRender) { + case 'WikimediaDesktop': + if (await MediaWiki.hasWikimediaDesktopRestApi()) { + this.baseUrl = MediaWiki.desktopRestApiUrl.href + this.baseUrlForMainPage = MediaWiki.desktopRestApiUrl.href + break + } + break + case 'VisualEditor': + if (await MediaWiki.hasVisualEditorApi()) { + this.baseUrl = MediaWiki.visualEditorApiUrl.href + this.baseUrlForMainPage = MediaWiki.visualEditorApiUrl.href + break + } + break + default: + throw new Error('Unable to find specific API end-point to retrieve article HTML') + } + } logger.log('Base Url: ', this.baseUrl) logger.log('Base Url for Main Page: ', this.baseUrlForMainPage) @@ -625,7 +645,7 @@ class Downloader { const articleData = await this.getJSON(articleApiUrl) if (articleData.error) { - const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}` + const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}` logger.error(errorMessage) /* If article is missing (for example because it just has been deleted) */ diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts index e6e0a19a..782f7a8d 100644 --- a/src/mwoffliner.lib.ts +++ b/src/mwoffliner.lib.ts @@ -99,6 +99,7 @@ async function execute(argv: any) { customZimFavicon, optimisationCacheUrl, customFlavour, + forceRender, } = argv let { articleList, articleListToIgnore } = argv @@ -212,8 +213,7 @@ async function execute(argv: any) { await MediaWiki.hasCoordinates(downloader) await MediaWiki.hasWikimediaDesktopRestApi() await MediaWiki.hasVisualEditorApi() - - await downloader.setBaseUrls() + await downloader.setBaseUrls(forceRender) RedisStore.setOptions(argv.redis ||
config.defaults.redisPath) await RedisStore.connect() @@ -420,7 +420,7 @@ async function execute(argv: any) { logger.log('Getting articles') stime = Date.now() - const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, dump) + const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, dump, forceRender) logger.log(`Fetching Articles finished in ${(Date.now() - stime) / 1000} seconds`) logger.log(`Found [${jsModuleDependencies.size}] js module dependencies`) @@ -607,32 +607,41 @@ async function execute(argv: any) { return mainPage ? createMainPageRedirect() : createMainPage() } + async function fetchArticleDetail(articleId: string) { + return await articleDetailXId.get(articleId) + } + + async function updateArticleThumbnail(articleDetail: any, articleId: string) { + const imageUrl = articleDetail.thumbnail + if (!imageUrl) return + + const { width: oldWidth } = getSizeFromUrl(imageUrl.source) + const suitableResUrl = imageUrl.source.replace(`/${oldWidth}px-`, '/500px-').replace(`-${oldWidth}px-`, '-500px-') + const { mult, width } = getSizeFromUrl(suitableResUrl) + const path = getMediaBase(suitableResUrl, false) + + articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I') + + await Promise.all([filesToDownloadXPath.set(path, { url: urlHelper.serializeUrl(suitableResUrl), mult, width } as FileDetail), articleDetailXId.set(articleId, articleDetail)]) + } + async function getThumbnailsData(): Promise { if (customMainPage || !articleList || articleListLines.length <= MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE) return + logger.log('Updating article thumbnails for articles') + let articleIndex = 0 let articlesWithImages = 0 while (articleIndex < articleListLines.length && articlesWithImages <= 100) { const articleId = articleListLines[articleIndex] articleIndex++ + try { - const articleDetail = await articleDetailXId.get(articleId) + const 
articleDetail = await fetchArticleDetail(articleId) if (!articleDetail) continue - const imageUrl = articleDetail.thumbnail - if (!imageUrl) continue - - const { width: oldWidth } = getSizeFromUrl(imageUrl.source) - const suitableResUrl = imageUrl.source.replace(`/${oldWidth}px-`, '/500px-').replace(`-${oldWidth}px-`, '-500px-') - const { mult, width } = getSizeFromUrl(suitableResUrl) - const path = getMediaBase(suitableResUrl, false) - articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I') - - await Promise.all([ - filesToDownloadXPath.set(path, { url: urlHelper.serializeUrl(suitableResUrl), mult, width } as FileDetail), - articleDetailXId.set(articleId, articleDetail), - ]) + await updateArticleThumbnail(articleDetail, articleId) articlesWithImages++ } catch (err) { logger.warn(`Failed to parse thumbnail for [${articleId}], skipping...`) diff --git a/src/parameterList.ts b/src/parameterList.ts index 230d6a22..6f31e797 100644 --- a/src/parameterList.ts +++ b/src/parameterList.ts @@ -39,6 +39,8 @@ export const parameterDescriptions = { osTmpDir: 'Override default operating system temporary directory path environment variable', customFlavour: 'A custom processor that can filter and process articles (see extensions/*.js)', optimisationCacheUrl: 'S3 url, including credentials and bucket name', + forceRender: + 'Force the usage of a specific API end-point/render, automatically chosen otherwise. Accepted values: [ VisualEditor, WikimediaDesktop, WikimediaMobile ]. 
More details at https://github.com/openzim/mwoffliner/wiki/API-end-points', } // TODO: Add an interface based on the object above diff --git a/src/renderers/abstract.renderer.ts b/src/renderers/abstract.renderer.ts index d85d879c..28981a6a 100644 --- a/src/renderers/abstract.renderer.ts +++ b/src/renderers/abstract.renderer.ts @@ -576,9 +576,7 @@ export abstract class Renderer { return false } - private applyOtherTreatments(parsoidDoc: DominoElement, dump: Dump) { - const filtersConfig = config.filters - + private clearLinkAndInputTags(parsoidDoc: DominoElement, filtersConfig: any, dump: Dump) { /* Don't need and tags */ const nodesToDelete: Array<{ class?: string; tag?: string; filter?: (n: any) => boolean }> = [{ tag: 'link' }, { tag: 'input' }] @@ -646,6 +644,42 @@ export abstract class Renderer { } } } + } + + private clearNodes(parsoidDoc: DominoElement, filtersConfig: any) { + const allNodes: DominoElement[] = Array.from(parsoidDoc.getElementsByTagName('*')) + for (const node of allNodes) { + node.removeAttribute('data-parsoid') + node.removeAttribute('typeof') + node.removeAttribute('about') + node.removeAttribute('data-mw') + + if (node.getAttribute('rel') && node.getAttribute('rel').substr(0, 3) === 'mw:') { + node.removeAttribute('rel') + } else if (node.getAttribute('img')) { + /* Remove a few images Parsoid attributes */ + node.removeAttribute('data-file-width') + node.removeAttribute('data-file-height') + node.removeAttribute('data-file-type') + } + + /* Remove a few css calls */ + filtersConfig.cssClassCallsBlackList.map((classname: string) => { + if (node.getAttribute('class')) { + node.setAttribute('class', node.getAttribute('class').replace(classname, '')) + } + }) + } + + const kartographerMaplinkNodes = Array.from(parsoidDoc.querySelectorAll('.mw-kartographer-maplink')).filter((n) => !!n.textContent) + for (const node of kartographerMaplinkNodes) { + node.textContent = '🌍' + } + } + + private applyOtherTreatments(parsoidDoc: DominoElement, dump: 
Dump) { + const filtersConfig = config.filters + this.clearLinkAndInputTags(parsoidDoc, filtersConfig, dump) /* Go through all reference calls */ const spans: DominoElement[] = Array.from(parsoidDoc.getElementsByTagName('span')) @@ -682,53 +716,22 @@ export abstract class Renderer { /* Remove empty paragraphs */ // TODO: Refactor this option to work with page/html and page/mobile-html output. See issues/1866 if (!dump.opts.keepEmptyParagraphs) { - if (!dump.opts.keepEmptyParagraphs) { - // Mobile view === details - // Desktop view === section - const sections: DominoElement[] = Array.from(parsoidDoc.querySelectorAll('details, section')) - for (const section of sections) { - if ( - section.children.length === - Array.from(section.children).filter((child: DominoElement) => { - return child.matches('summary') - }).length - ) { - DU.deleteNode(section) - } + // Mobile view === details + // Desktop view === section + const sections: DominoElement[] = Array.from(parsoidDoc.querySelectorAll('details, section')) + for (const section of sections) { + if ( + section.children.length === + Array.from(section.children).filter((child: DominoElement) => { + return child.matches('summary') + }).length + ) { + DU.deleteNode(section) } } } - /* Clean the DOM of all uncessary code */ - const allNodes: DominoElement[] = Array.from(parsoidDoc.getElementsByTagName('*')) - for (const node of allNodes) { - node.removeAttribute('data-parsoid') - node.removeAttribute('typeof') - node.removeAttribute('about') - node.removeAttribute('data-mw') - - if (node.getAttribute('rel') && node.getAttribute('rel').substr(0, 3) === 'mw:') { - node.removeAttribute('rel') - } else if (node.getAttribute('img')) { - /* Remove a few images Parsoid attributes */ - node.removeAttribute('data-file-width') - node.removeAttribute('data-file-height') - node.removeAttribute('data-file-type') - } - - /* Remove a few css calls */ - filtersConfig.cssClassCallsBlackList.map((classname: string) => { - if 
(node.getAttribute('class')) { - node.setAttribute('class', node.getAttribute('class').replace(classname, '')) - } - }) - } - - const kartographerMaplinkNodes = Array.from(parsoidDoc.querySelectorAll('.mw-kartographer-maplink')).filter((n) => !!n.textContent) - for (const node of kartographerMaplinkNodes) { - node.textContent = '🌍' - } - + this.clearNodes(parsoidDoc, filtersConfig) return parsoidDoc } diff --git a/src/sanitize-argument.ts b/src/sanitize-argument.ts index b4504a3e..0b2b49d1 100644 --- a/src/sanitize-argument.ts +++ b/src/sanitize-argument.ts @@ -18,7 +18,19 @@ const parametersWithArrayType = ['format'] export async function sanitize_all(argv: any) { // extracting all arguments - const { articleList, addNamespaces, speed: _speed, adminEmail, mwUrl, customZimFavicon, optimisationCacheUrl, verbose, customZimLongDescription, customZimDescription } = argv + const { + articleList, + addNamespaces, + speed: _speed, + adminEmail, + mwUrl, + customZimFavicon, + optimisationCacheUrl, + verbose, + customZimLongDescription, + customZimDescription, + forceRender, + } = argv sanitizeDoubleUsedParameters(argv) @@ -73,6 +85,11 @@ export async function sanitize_all(argv: any) { // sanitizing adminEmail sanitize_adminEmail(adminEmail) + // sanitizing renderer + if (forceRender) { + sanitize_forceRender(forceRender) + } + // Redis client sanitization // created a redis client and then closed it. 
sanitize_redis(argv) @@ -173,3 +190,14 @@ export function sanitize_customFlavour(customFlavour: string): string { }) || null ) } + +export function sanitize_forceRender(renderName: string): string { + const renderNames = ['VisualEditor', 'WikimediaDesktop', 'WikimediaMobile'] + const checkRenderName = (arr: string[], val: string) => { + return arr.some((arrVal) => val === arrVal) + } + if (checkRenderName(renderNames, renderName)) { + return renderName + } + throw new Error(`Invalid render name: ${renderName}`) +} diff --git a/src/util/saveArticles.ts b/src/util/saveArticles.ts index a8ee0e7f..cf60d2bb 100644 --- a/src/util/saveArticles.ts +++ b/src/util/saveArticles.ts @@ -249,7 +249,7 @@ export function getArticleUrl(downloader: Downloader, dump: Dump, articleId: str /* * Fetch Articles */ -export async function saveArticles(zimCreator: ZimCreator, downloader: Downloader, dump: Dump) { +export async function saveArticles(zimCreator: ZimCreator, downloader: Downloader, dump: Dump, forceRender = null) { const jsModuleDependencies = new Set() const cssModuleDependencies = new Set() let jsConfigVars = '' @@ -258,9 +258,19 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade const articlesTotal = await articleDetailXId.len() const rendererBuilder = new RendererBuilder() - const rendererBuilderOptions: RendererBuilderOptions = { - renderType: 'auto', + + let rendererBuilderOptions: RendererBuilderOptions + if (forceRender) { + rendererBuilderOptions = { + renderType: 'specific', + renderName: forceRender, + } + } else { + rendererBuilderOptions = { + renderType: 'auto', + } } + const mainPageRenderer = await rendererBuilder.createRenderer(rendererBuilderOptions) // TODO: article renderer will be switched to the mobile mode later const articlesRenderer = await rendererBuilder.createRenderer(rendererBuilderOptions) diff --git a/test/e2e/forceRender.test.ts b/test/e2e/forceRender.test.ts new file mode 100644 index 00000000..edac673b --- 
/dev/null +++ b/test/e2e/forceRender.test.ts @@ -0,0 +1,55 @@ +import * as mwoffliner from '../../src/mwoffliner.lib.js' +import { execa } from 'execa' +import rimraf from 'rimraf' +import { jest } from '@jest/globals' +import { zimcheckAvailable, zimcheck } from '../util.js' + +jest.setTimeout(200000) + +describe('forceRender', () => { + const now = new Date() + const testId = `mwo-test-${+now}` + + const parameters = { + mwUrl: 'https://bm.wikipedia.org', + adminEmail: 'test@kiwix.org', + outputDirectory: testId, + redis: process.env.REDIS, + format: ['nopic'], + articleList: 'France', + } + + beforeAll(async () => { + await execa('redis-cli flushall', { shell: true }) + }) + + test('Scrape article from bm.wikipedia.org using WikimediaDesktop render', async () => { + const forceRender = 'WikimediaDesktop' + const outFiles = await mwoffliner.execute({ ...parameters, forceRender }) + + if (await zimcheckAvailable()) { + await expect(zimcheck(outFiles[0].outFile)).resolves.not.toThrowError() + } else { + console.log('Zimcheck not installed, skipping test') + } + + rimraf.sync(`./${testId}`) + const redisScan = await execa('redis-cli --scan', { shell: true }) + // Redis has been cleared + expect(redisScan.stdout).toEqual('') + }) + + test('Scrape article from bm.wikipedia.org should throw error when using VisualEditor render', async () => { + const forceRender = 'VisualEditor' + await expect(async () => { + await mwoffliner.execute({ ...parameters, forceRender }) + }).rejects.toThrowError() + }) + + test('Scrape article from bm.wikipedia.org should throw error when using wrong render', async () => { + const forceRender = 'unknownRenderName' + await expect(async () => { + await mwoffliner.execute({ ...parameters, forceRender }) + }).rejects.toThrowError() + }) +})