From d2768ef151780908c1d9a5dc0ee5a55ae61f21f6 Mon Sep 17 00:00:00 2001 From: Emmanuel Engelhart Date: Sat, 30 Sep 2023 17:13:23 +0200 Subject: [PATCH 1/2] Don't mirror 'Story' namespace --- src/MediaWiki.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts index 9f4951bb..7b262665 100644 --- a/src/MediaWiki.ts +++ b/src/MediaWiki.ts @@ -227,18 +227,22 @@ class MediaWiki { const num = entry.id const allowedSubpages = 'subpages' in entry const isContent = type === 'namespaces' ? !!(entry.content || util.contains(addNamespaces, num)) : !!(entry.content !== undefined || util.contains(addNamespaces, num)) + const isBlacklisted = name === 'Story' // 'Story' Wikipedia namespace is content, but not indgestable by Parsoid https://github.com/openzim/mwoffliner/issues/1853 const canonical = entry.canonical ? entry.canonical : '' const details = { num, allowedSubpages, isContent } + /* Namespaces in local language */ this.namespaces[util.lcFirst(name)] = details this.namespaces[util.ucFirst(name)] = details + /* Namespaces in English (if available) */ if (canonical) { this.namespaces[util.lcFirst(canonical)] = details this.namespaces[util.ucFirst(canonical)] = details } + /* Is content to mirror */ - if (isContent) { + if (isContent && !isBlacklisted) { this.namespacesToMirror.push(name) } }) From 4b29977d5160324b8031e56a5f3a83e873366304 Mon Sep 17 00:00:00 2001 From: Vadim Kovalenko Date: Tue, 3 Oct 2023 16:12:06 +0300 Subject: [PATCH 2/2] Move blacklisted NS to consts, add unit test --- src/MediaWiki.ts | 4 ++-- src/util/const.ts | 1 + test/unit/mwApi.test.ts | 48 ++++++++++++++++++++++++++++++++--------- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/MediaWiki.ts b/src/MediaWiki.ts index 7b262665..65ca1055 100644 --- a/src/MediaWiki.ts +++ b/src/MediaWiki.ts @@ -12,6 +12,7 @@ import ApiURLDirector from './util/builders/url/api.director.js' import DesktopURLDirector from 
'./util/builders/url/desktop.director.js' import VisualEditorURLDirector from './util/builders/url/visual-editor.director.js' import { checkApiAvailability } from './util/mw-api.js' +import { BLACKLISTED_NS } from './util/const.js' export interface QueryOpts { action: string @@ -34,7 +35,6 @@ class MediaWiki { } public metaData: MWMetaData - public _base: string public baseUrl: URL public getCategories: boolean public namespaces: MWNamespaces = {} @@ -227,7 +227,7 @@ class MediaWiki { const num = entry.id const allowedSubpages = 'subpages' in entry const isContent = type === 'namespaces' ? !!(entry.content || util.contains(addNamespaces, num)) : !!(entry.content !== undefined || util.contains(addNamespaces, num)) - const isBlacklisted = name === 'Story' // 'Story' Wikipedia namespace is content, but not indgestable by Parsoid https://github.com/openzim/mwoffliner/issues/1853 + const isBlacklisted = BLACKLISTED_NS.includes(name) const canonical = entry.canonical ? entry.canonical : '' const details = { num, allowedSubpages, isContent } diff --git a/src/util/const.ts b/src/util/const.ts index 4a781482..f7bbb515 100644 --- a/src/util/const.ts +++ b/src/util/const.ts @@ -19,3 +19,4 @@ export const LOAD_PHP = /script.src = ".*load\.php.*";/ export const RULE_TO_REDIRECT = /window\.top !== window\.self/ export const WEBP_HANDLER_URL = 'https://gist.githubusercontent.com/rgaudin/60bb9cc6f187add506584258028b8ee1/raw/9d575b8e25d67eed2a9c9a91d3e053a0062d2fc7/web-handler.js' export const MAX_FILE_DOWNLOAD_RETRIES = 5 +export const BLACKLISTED_NS = ['Story'] // 'Story' Wikipedia namespace is content, but not ingestible by Parsoid https://github.com/openzim/mwoffliner/issues/1853 diff --git a/test/unit/mwApi.test.ts b/test/unit/mwApi.test.ts index f7a92ec3..9b73fadd 100644 --- a/test/unit/mwApi.test.ts +++ b/test/unit/mwApi.test.ts @@ -9,10 +9,22 @@ import { jest } from '@jest/globals' jest.setTimeout(10000) -describe('mwApi', () => { - beforeAll(startRedis) - 
afterAll(stopRedis) +beforeAll(async () => { + MediaWiki.reset() + await startRedis() +}) +afterAll(stopRedis) + +const initMW = async (downloader: Downloader) => { + await MediaWiki.getMwMetaData(downloader) + await MediaWiki.hasCoordinates(downloader) + await MediaWiki.hasWikimediaDesktopRestApi() + await MediaWiki.hasVisualEditorApi() + await MediaWiki.getNamespaces([], downloader) +} + +describe('mwApi', () => { let downloader: Downloader beforeEach(async () => { @@ -20,15 +32,9 @@ describe('mwApi', () => { MediaWiki.base = 'https://en.wikipedia.org' MediaWiki.getCategories = true - downloader = new Downloader({ uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: false, optimisationCacheUrl: '' }) - await MediaWiki.getMwMetaData(downloader) - await MediaWiki.hasCoordinates(downloader) - await MediaWiki.hasWikimediaDesktopRestApi() - await MediaWiki.hasVisualEditorApi() - - await MediaWiki.getNamespaces([], downloader) + await initMW(downloader) }) test('MWApi Article Ids', async () => { @@ -116,3 +122,25 @@ describe('mwApi', () => { expect(interWikiTitle).toBeNull() }) }) + +describe('Test blacklisted NSs', () => { + let downloader: Downloader + + beforeEach(async () => { + await RedisStore.articleDetailXId.flush() + + MediaWiki.base = 'https://id.wikipedia.org' + MediaWiki.getCategories = true + + downloader = new Downloader({ uaString: `${config.userAgent} (contact@kiwix.org)`, speed: 1, reqTimeout: 1000 * 60, webp: false, optimisationCacheUrl: '' }) + + await initMW(downloader) + }) + + test('Prevent blacklisted namespaces to mirroring', async () => { + const aIds = ['Story:Satelit_Oberon', 'London'] + await getArticleIds(downloader, 'Main_Page', aIds) + + expect(MediaWiki.namespacesToMirror).not.toContain('Story') + }) +})