diff --git a/src/mwoffliner.lib.ts b/src/mwoffliner.lib.ts
index 782f7a8d..91e3a8f2 100644
--- a/src/mwoffliner.lib.ts
+++ b/src/mwoffliner.lib.ts
@@ -218,7 +218,7 @@ async function execute(argv: any) {
   RedisStore.setOptions(argv.redis || config.defaults.redisPath)
   await RedisStore.connect()
   const { articleDetailXId, filesToDownloadXPath, filesToRetryXPath, redirectsXId } = RedisStore
-
+  await downloader.setBaseUrls(forceRender)
   // Output directory
   const outputDirectory = path.isAbsolute(_outputDirectory || '') ? _outputDirectory : path.join(process.cwd(), _outputDirectory || 'out')
   await mkdirPromise(outputDirectory)
diff --git a/src/sanitize-argument.ts b/src/sanitize-argument.ts
index 0b2b49d1..41d3cef4 100644
--- a/src/sanitize-argument.ts
+++ b/src/sanitize-argument.ts
@@ -11,6 +11,7 @@ import { isValidEmail } from './util/index.js'
 import * as path from 'path'
 import { fileURLToPath } from 'url'
 import { parameterDescriptions } from './parameterList.js'
+import { RENDERERS_LIST } from './util/const.js'
 
 const __filename = fileURLToPath(import.meta.url)
 const __dirname = path.dirname(__filename)
@@ -192,11 +193,10 @@ export function sanitize_customFlavour(customFlavour: string): string {
 }
 
 export function sanitize_forceRender(renderName: string): string {
-  const renderNames = ['VisualEditor', 'WikimediaDesktop', 'WikimediaMobile']
   const checkRenderName = (arr: string[], val: string) => {
     return arr.some((arrVal) => val === arrVal)
   }
-  if (checkRenderName(renderNames, renderName)) {
+  if (checkRenderName(RENDERERS_LIST, renderName)) {
     return renderName
   }
   throw new Error(`Invalid render name: ${renderName}`)
diff --git a/src/util/const.ts b/src/util/const.ts
index f7bbb515..6c511f56 100644
--- a/src/util/const.ts
+++ b/src/util/const.ts
@@ -20,3 +20,4 @@ export const RULE_TO_REDIRECT = /window\.top !== window\.self/
 export const WEBP_HANDLER_URL = 'https://gist.githubusercontent.com/rgaudin/60bb9cc6f187add506584258028b8ee1/raw/9d575b8e25d67eed2a9c9a91d3e053a0062d2fc7/web-handler.js'
 export const MAX_FILE_DOWNLOAD_RETRIES = 5
 export const BLACKLISTED_NS = ['Story'] // 'Story' Wikipedia namespace is content, but not indgestable by Parsoid https://github.com/openzim/mwoffliner/issues/1853
+export const RENDERERS_LIST = ['WikimediaDesktop', 'VisualEditor']
diff --git a/test/e2e/en.e2e.test.ts b/test/e2e/en.e2e.test.ts
new file mode 100644
index 00000000..19f66928
--- /dev/null
+++ b/test/e2e/en.e2e.test.ts
@@ -0,0 +1,45 @@
+import { testAllRenders } from '../testAllRenders.js'
+import domino from 'domino'
+import { zimdump } from '../util.js'
+import 'dotenv/config.js'
+import { jest } from '@jest/globals'
+import rimraf from 'rimraf'
+
+jest.setTimeout(60000)
+
+// Check the integrity of img elements between zim file and article html taken from it
+const verifyImgElements = (imgFilesArr, imgElements) => {
+  for (const img of imgElements) {
+    for (const imgFile of imgFilesArr) {
+      if (img.getAttribute('src').includes(imgFile)) {
+        return true
+      }
+    }
+  }
+  return false
+}
+
+const mwUrl = 'https://en.wikipedia.org'
+const articleList = 'User:Kelson/MWoffliner_CI_reference'
+const format = ''
+
+await testAllRenders(mwUrl, articleList, format, async (outFiles) => {
+  const articleFromDump = await zimdump(`show --url A/${articleList} ${outFiles[0].outFile}`)
+  describe('e2e test for en.wikipedia.org', () => {
+    const articleDoc = domino.createDocument(articleFromDump)
+    test(`test article header for ${outFiles[0]?.renderer} renderer`, async () => {
+      expect(articleDoc.querySelector('h1.article-header')).toBeTruthy()
+    })
+    test(`test article image integrity for ${outFiles[0]?.renderer} renderer`, async () => {
+      const mediaFiles = await zimdump(`list --ns I ${outFiles[0].outFile}`)
+      const mediaFilesArr = mediaFiles.split('\n')
+      const imgFilesArr = mediaFilesArr.filter((elem) => elem.endsWith('pdf') || elem.endsWith('png') || elem.endsWith('jpg'))
+      const imgElements = Array.from(articleDoc.querySelectorAll('img'))
+      expect(verifyImgElements(imgFilesArr, imgElements)).toBe(true)
+    })
+
+    afterAll(() => {
+      rimraf.sync(`./${outFiles[0].testId}`)
+    })
+  })
+})
diff --git a/test/testAllRenders.ts b/test/testAllRenders.ts
new file mode 100644
index 00000000..16c7e330
--- /dev/null
+++ b/test/testAllRenders.ts
@@ -0,0 +1,59 @@
+import * as logger from '../src/Logger.js'
+import * as mwoffliner from '../src/mwoffliner.lib.js'
+import { execa } from 'execa'
+import { RENDERERS_LIST } from '../src/util/const.js'
+import { zimcheckAvailable, zimdumpAvailable } from './util.js'
+
+/*
+  This is the template for e2e tests of different wikis
+  1. Verify zimcheck and zimdump availability and caches result
+  2. Gets output file and checks its integrity
+  3. Returns output file per renderer in the callback function
+*/
+
+let zimToolsChecked = false
+async function checkZimTools() {
+  if (zimToolsChecked) {
+    return
+  }
+
+  const zimcheckIsAvailable = await zimcheckAvailable()
+  const zimdumpIsAvailable = await zimdumpAvailable()
+
+  if (!zimcheckIsAvailable || !zimdumpIsAvailable) {
+    const missingTool = !zimcheckIsAvailable ? 'Zimcheck' : 'Zimdump'
+    logger.error(`${missingTool} not installed, exiting test`)
+    process.exit(1)
+  }
+
+  zimToolsChecked = true
+}
+
+async function getOutFiles(renderName: string, testId: string, articleList: string, mwUrl: string, format?: string | string[]): Promise<any> {
+  const parameters = {
+    mwUrl,
+    adminEmail: 'test@kiwix.org',
+    outputDirectory: testId,
+    redis: process.env.REDIS,
+    articleList,
+    forceRender: renderName,
+    format,
+  }
+
+  await execa('redis-cli flushall', { shell: true })
+  const outFiles = await mwoffliner.execute(parameters)
+
+  return outFiles
+}
+
+export async function testAllRenders(mwUrl: string, articleList: string, format: string | string[], callback) {
+  await checkZimTools()
+  for (const renderer of RENDERERS_LIST) {
+    const now = new Date()
+    const testId = `mwo-test-${+now}`
+    const outFiles = await getOutFiles(renderer, testId, articleList, mwUrl, format)
+    outFiles[0].testId = testId
+    outFiles[0].renderer = renderer
+    await callback(outFiles)
+  }
+}
diff --git a/test/unit/saveArticles.test.ts b/test/unit/saveArticles.test.ts
index 525f80b0..d1644e4b 100644
--- a/test/unit/saveArticles.test.ts
+++ b/test/unit/saveArticles.test.ts
@@ -10,6 +10,7 @@ import { jest } from '@jest/globals'
 import { getArticleUrl } from '../../src/util/saveArticles.js'
 import { WikimediaDesktopRenderer } from '../../src/renderers/wikimedia-desktop.renderer.js'
 import { VisualEditorRenderer } from '../../src/renderers/visual-editor.renderer.js'
+import { RENDERERS_LIST } from '../../src/util/const.js'
 
 jest.setTimeout(40000)
 
@@ -79,69 +80,49 @@ describe('saveArticles', () => {
     expect(articleDoc.querySelector('h1.article-header')).toBeTruthy()
   })
 
-  test('Check nodet article for en.wikipedia.org using Visual Editor renderer', async () => {
-    const visualEditorRenderer = new VisualEditorRenderer()
-    const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org', format: 'nodet' }) // en wikipedia
-    await downloader.setBaseUrls('VisualEditor')
-    const articleId = 'Canada'
-    const articleUrl = getArticleUrl(downloader, dump, articleId)
-    const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId])
-    const articlesDetail = mwRetToArticleDetail(_articleDetailsRet)
-    const { articleDetailXId } = RedisStore
-    const articleDetail = { title: articleId, timestamp: '2023-09-10T17:36:04Z' }
-    const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)
-    articleDetailXId.setMany(articlesDetail)
-    const result = await downloader.getArticle(
-      downloader.webp,
-      _moduleDependencies,
-      articleId,
-      articleDetailXId,
-      visualEditorRenderer,
-      articleUrl,
-      dump,
-      articleDetail,
-      dump.isMainPage(articleId),
-    )
-
-    const articleDoc = domino.createDocument(result[0].html)
-
-    const sections = Array.from(articleDoc.querySelectorAll('section'))
-    const leadSection = sections[0]
-    expect(sections.length).toEqual(1)
-    expect(leadSection.getAttribute('data-mw-section-id')).toEqual('0')
-  })
-
-  test('Check nodet article for en.wikipedia.org using Wikimedia Desktop renderer', async () => {
-    const wikimediaDesktopRenderer = new WikimediaDesktopRenderer()
-    const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org', format: 'nodet' }) // en wikipedia
-    await downloader.setBaseUrls('WikimediaDesktop')
-    const articleId = 'London'
-    const articleUrl = getArticleUrl(downloader, dump, articleId)
-    const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId])
-    const articlesDetail = mwRetToArticleDetail(_articleDetailsRet)
-    const { articleDetailXId } = RedisStore
-    const articleDetail = { title: articleId }
-    const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)
-    articleDetailXId.setMany(articlesDetail)
-    const result = await downloader.getArticle(
-      downloader.webp,
-      _moduleDependencies,
-      articleId,
-      articleDetailXId,
-      wikimediaDesktopRenderer,
-      articleUrl,
-      dump,
-      articleDetail,
-      dump.isMainPage(articleId),
-    )
+  for (const renderer of RENDERERS_LIST) {
+    test(`Check nodet article for en.wikipedia.org using ${renderer} renderer`, async () => {
+      let rendererInstance
+      switch (renderer) {
+        case 'VisualEditor':
+          rendererInstance = new VisualEditorRenderer()
+          break
+        case 'WikimediaDesktop':
+          rendererInstance = new WikimediaDesktopRenderer()
+          break
+        default:
+          throw new Error(`Unknown renderer: ${renderer}`)
+      }
+      const { downloader, dump } = await setupScrapeClasses({ mwUrl: 'https://en.wikipedia.org', format: 'nodet' }) // en wikipedia
+      await downloader.setBaseUrls(renderer)
+      const articleId = 'Canada'
+      const articleUrl = getArticleUrl(downloader, dump, articleId)
+      const _articleDetailsRet = await downloader.getArticleDetailsIds([articleId])
+      const articlesDetail = mwRetToArticleDetail(_articleDetailsRet)
+      const { articleDetailXId } = RedisStore
+      const articleDetail = { title: articleId, timestamp: '2023-09-10T17:36:04Z' }
+      const _moduleDependencies = await downloader.getModuleDependencies(articleDetail.title)
+      articleDetailXId.setMany(articlesDetail)
+      const result = await downloader.getArticle(
+        downloader.webp,
+        _moduleDependencies,
+        articleId,
+        articleDetailXId,
+        rendererInstance,
+        articleUrl,
+        dump,
+        articleDetail,
+        dump.isMainPage(articleId),
+      )
 
-    const articleDoc = domino.createDocument(result[0].html)
+      const articleDoc = domino.createDocument(result[0].html)
 
-    const sections = Array.from(articleDoc.querySelectorAll('section'))
-    const leadSection = sections[0]
-    expect(sections.length).toEqual(1)
-    expect(leadSection.getAttribute('data-mw-section-id')).toEqual('0')
-  })
+      const sections = Array.from(articleDoc.querySelectorAll('section'))
+      const leadSection = sections[0]
+      expect(sections.length).toEqual(1)
+      expect(leadSection.getAttribute('data-mw-section-id')).toEqual('0')
+    })
+  }
 
   test('Load main page and check that it is without header', async () => {
     const wikimediaDesktopRenderer = new WikimediaDesktopRenderer()