Skip to content

Commit

Permalink
Merge pull request #1901 from openzim/1898-introduce-forceRender-param
Browse files Browse the repository at this point in the history
Introduce forceRender param
  • Loading branch information
kelson42 authored Sep 8, 2023
2 parents 7174801 + 9237475 commit 2c5553b
Show file tree
Hide file tree
Showing 7 changed files with 208 additions and 81 deletions.
48 changes: 34 additions & 14 deletions src/Downloader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,19 +166,39 @@ class Downloader {
}
}

public async setBaseUrls() {
//* Objects order in array matters!
this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([
{ condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
{ condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
])

//* Objects order in array matters!
this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([
{ condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
{ condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
])

/**
 * Select the base URLs used for regular articles (`baseUrl`) and for the main page (`baseUrlForMainPage`).
 *
 * Without `forceRender`, both URLs are picked by `basicURLDirector.buildDownloaderBaseUrl` from the
 * candidate list (candidate order matters). With `forceRender`, only the requested end-point is
 * considered and it is used for both URLs.
 *
 * @param forceRender - Name of the end-point to force ('WikimediaDesktop' or 'VisualEditor'), or a falsy value for automatic selection.
 * @throws Error when `forceRender` names an end-point that is not handled here, or one that is not available on the wiki.
 */
public async setBaseUrls(forceRender = null) {
  if (!forceRender) {
    //* Objects order in array matters!
    this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([
      { condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
      { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
    ])

    //* Objects order in array matters!
    this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([
      { condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
      { condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
    ])
  } else {
    // The capability checks are asynchronous (they are awaited everywhere else in this method),
    // so they must be awaited here as well: a bare Promise is always truthy and would make the
    // availability guard meaningless.
    let forcedBaseUrl: string | undefined
    switch (forceRender) {
      case 'WikimediaDesktop':
        if (await MediaWiki.hasWikimediaDesktopRestApi()) {
          forcedBaseUrl = MediaWiki.desktopRestApiUrl.href
        }
        break
      case 'VisualEditor':
        if (await MediaWiki.hasVisualEditorApi()) {
          forcedBaseUrl = MediaWiki.visualEditorApiUrl.href
        }
        break
      default:
        throw new Error('Unable to find specific API end-point to retrieve article HTML')
    }

    // Fail loudly instead of silently leaving both base URLs unset when the forced end-point is unavailable.
    if (!forcedBaseUrl) {
      throw new Error('Unable to find specific API end-point to retrieve article HTML')
    }
    this.baseUrl = forcedBaseUrl
    this.baseUrlForMainPage = forcedBaseUrl
  }
logger.log('Base Url: ', this.baseUrl)
logger.log('Base Url for Main Page: ', this.baseUrlForMainPage)

Expand Down Expand Up @@ -625,7 +645,7 @@ class Downloader {
const articleData = await this.getJSON<any>(articleApiUrl)

if (articleData.error) {
const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}`
// Report the article title; interpolating `this` would stringify the Downloader instance as "[object Object]".
const errorMessage = `Unable to retrieve js/css dependencies for article '${title}': ${articleData.error.code}`
logger.error(errorMessage)

/* If article is missing (for example because it just has been deleted) */
Expand Down
43 changes: 26 additions & 17 deletions src/mwoffliner.lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ async function execute(argv: any) {
customZimFavicon,
optimisationCacheUrl,
customFlavour,
forceRender,
} = argv

let { articleList, articleListToIgnore } = argv
Expand Down Expand Up @@ -212,8 +213,7 @@ async function execute(argv: any) {
await MediaWiki.hasCoordinates(downloader)
await MediaWiki.hasWikimediaDesktopRestApi()
await MediaWiki.hasVisualEditorApi()

await downloader.setBaseUrls()
await downloader.setBaseUrls(forceRender)

RedisStore.setOptions(argv.redis || config.defaults.redisPath)
await RedisStore.connect()
Expand Down Expand Up @@ -420,7 +420,7 @@ async function execute(argv: any) {

logger.log('Getting articles')
stime = Date.now()
const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, dump)
const { jsModuleDependencies, cssModuleDependencies } = await saveArticles(zimCreator, downloader, dump, forceRender)
logger.log(`Fetching Articles finished in ${(Date.now() - stime) / 1000} seconds`)

logger.log(`Found [${jsModuleDependencies.size}] js module dependencies`)
Expand Down Expand Up @@ -607,32 +607,41 @@ async function execute(argv: any) {
return mainPage ? createMainPageRedirect() : createMainPage()
}

/**
 * Look up the stored detail record of one article.
 *
 * @param articleId - Key of the article in the `articleDetailXId` store.
 * @returns Whatever `articleDetailXId.get` resolves to for that key.
 */
async function fetchArticleDetail(articleId: string) {
  const storedDetail = await articleDetailXId.get(articleId)
  return storedDetail
}

/**
 * Point an article at a 500px rendition of its thumbnail and queue that image for download.
 *
 * Does nothing when the article has no `thumbnail`. Otherwise the width segment of the thumbnail
 * URL is rewritten to `500px`, `articleDetail.internalThumbnailUrl` is set (the passed object is
 * mutated), and both the download entry and the updated article detail are persisted.
 *
 * @param articleDetail - Article detail record; read for `thumbnail.source` and updated in place.
 * @param articleId - Key under which the updated detail is written back to `articleDetailXId`.
 */
async function updateArticleThumbnail(articleDetail: any, articleId: string) {
  const thumbnail = articleDetail.thumbnail
  if (thumbnail) {
    const originalWidth = getSizeFromUrl(thumbnail.source).width

    // The width can appear either right after a slash or after a dash; handle both spellings.
    const resizedUrl = thumbnail.source.replace(`/${originalWidth}px-`, '/500px-').replace(`-${originalWidth}px-`, '-500px-')
    const { mult: resizedMult, width: resizedWidth } = getSizeFromUrl(resizedUrl)
    const downloadPath = getMediaBase(resizedUrl, false)

    articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(resizedUrl, true), 'I')

    const fileDetail = { url: urlHelper.serializeUrl(resizedUrl), mult: resizedMult, width: resizedWidth } as FileDetail

    // The two writes are independent of each other, so they run concurrently.
    await Promise.all([filesToDownloadXPath.set(downloadPath, fileDetail), articleDetailXId.set(articleId, articleDetail)])
  }
}

async function getThumbnailsData(): Promise<void> {
if (customMainPage || !articleList || articleListLines.length <= MIN_IMAGE_THRESHOLD_ARTICLELIST_PAGE) return

logger.log('Updating article thumbnails for articles')

let articleIndex = 0
let articlesWithImages = 0

while (articleIndex < articleListLines.length && articlesWithImages <= 100) {
const articleId = articleListLines[articleIndex]
articleIndex++

try {
const articleDetail = await articleDetailXId.get(articleId)
const articleDetail = await fetchArticleDetail(articleId)
if (!articleDetail) continue

const imageUrl = articleDetail.thumbnail
if (!imageUrl) continue

const { width: oldWidth } = getSizeFromUrl(imageUrl.source)
const suitableResUrl = imageUrl.source.replace(`/${oldWidth}px-`, '/500px-').replace(`-${oldWidth}px-`, '-500px-')
const { mult, width } = getSizeFromUrl(suitableResUrl)
const path = getMediaBase(suitableResUrl, false)
articleDetail.internalThumbnailUrl = getRelativeFilePath('Main_Page', getMediaBase(suitableResUrl, true), 'I')

await Promise.all([
filesToDownloadXPath.set(path, { url: urlHelper.serializeUrl(suitableResUrl), mult, width } as FileDetail),
articleDetailXId.set(articleId, articleDetail),
])
await updateArticleThumbnail(articleDetail, articleId)
articlesWithImages++
} catch (err) {
logger.warn(`Failed to parse thumbnail for [${articleId}], skipping...`)
Expand Down
2 changes: 2 additions & 0 deletions src/parameterList.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ export const parameterDescriptions = {
osTmpDir: 'Override default operating system temporary directory path environment variable',
customFlavour: 'A custom processor that can filter and process articles (see extensions/*.js)',
optimisationCacheUrl: 'S3 url, including credentials and bucket name',
forceRender:
  'Force the usage of a specific API end-point/render, automatically chosen otherwise. Accepted values: [ VisualEditor, WikimediaDesktop, WikimediaMobile ]. More details at https://github.com/openzim/mwoffliner/wiki/API-end-points',
}

// TODO: Add an interface based on the object above
95 changes: 49 additions & 46 deletions src/renderers/abstract.renderer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -576,9 +576,7 @@ export abstract class Renderer {
return false
}

private applyOtherTreatments(parsoidDoc: DominoElement, dump: Dump) {
const filtersConfig = config.filters

private clearLinkAndInputTags(parsoidDoc: DominoElement, filtersConfig: any, dump: Dump) {
/* Don't need <link> and <input> tags */
const nodesToDelete: Array<{ class?: string; tag?: string; filter?: (n: any) => boolean }> = [{ tag: 'link' }, { tag: 'input' }]

Expand Down Expand Up @@ -646,6 +644,42 @@ export abstract class Renderer {
}
}
}
}

/**
 * Strip Parsoid bookkeeping attributes and blacklisted CSS classes from every element of the
 * document, then replace the text of non-empty Kartographer map links with a globe emoji.
 *
 * @param parsoidDoc - Document (or element) to clean; it is modified in place.
 * @param filtersConfig - Filter configuration; only `cssClassCallsBlackList` is read here.
 */
private clearNodes(parsoidDoc: DominoElement, filtersConfig: any) {
  const elements: DominoElement[] = Array.from(parsoidDoc.getElementsByTagName('*'))

  elements.forEach((element) => {
    for (const attributeName of ['data-parsoid', 'typeof', 'about', 'data-mw']) {
      element.removeAttribute(attributeName)
    }

    const rel = element.getAttribute('rel')
    if (rel && rel.startsWith('mw:')) {
      element.removeAttribute('rel')
    } else if (element.getAttribute('img')) {
      // NOTE(review): this tests for an *attribute* named "img", not for <img> elements — confirm that is intended.
      /* Remove a few images Parsoid attributes */
      for (const attributeName of ['data-file-width', 'data-file-height', 'data-file-type']) {
        element.removeAttribute(attributeName)
      }
    }

    /* Remove a few css calls */
    filtersConfig.cssClassCallsBlackList.forEach((classname: string) => {
      // Re-read the attribute on every pass: the previous pass may have rewritten it.
      const currentClass = element.getAttribute('class')
      if (currentClass) {
        // String.replace with a string pattern only removes the first occurrence.
        element.setAttribute('class', currentClass.replace(classname, ''))
      }
    })
  })

  Array.from<DominoElement>(parsoidDoc.querySelectorAll('.mw-kartographer-maplink'))
    .filter((maplink) => !!maplink.textContent)
    .forEach((maplink) => {
      maplink.textContent = '🌍'
    })
}

private applyOtherTreatments(parsoidDoc: DominoElement, dump: Dump) {
const filtersConfig = config.filters
this.clearLinkAndInputTags(parsoidDoc, filtersConfig, dump)

/* Go through all reference calls */
const spans: DominoElement[] = Array.from(parsoidDoc.getElementsByTagName('span'))
Expand Down Expand Up @@ -682,53 +716,22 @@ export abstract class Renderer {
/* Remove empty paragraphs */
// TODO: Refactor this option to work with page/html and page/mobile-html output. See issues/1866
if (!dump.opts.keepEmptyParagraphs) {
if (!dump.opts.keepEmptyParagraphs) {
// Mobile view === details
// Desktop view === section
const sections: DominoElement[] = Array.from(parsoidDoc.querySelectorAll('details, section'))
for (const section of sections) {
if (
section.children.length ===
Array.from(section.children).filter((child: DominoElement) => {
return child.matches('summary')
}).length
) {
DU.deleteNode(section)
}
// Mobile view === details
// Desktop view === section
const sections: DominoElement[] = Array.from(parsoidDoc.querySelectorAll('details, section'))
for (const section of sections) {
if (
section.children.length ===
Array.from(section.children).filter((child: DominoElement) => {
return child.matches('summary')
}).length
) {
DU.deleteNode(section)
}
}
}

/* Clean the DOM of all unnecessary code */
const allNodes: DominoElement[] = Array.from(parsoidDoc.getElementsByTagName('*'))
for (const node of allNodes) {
node.removeAttribute('data-parsoid')
node.removeAttribute('typeof')
node.removeAttribute('about')
node.removeAttribute('data-mw')

if (node.getAttribute('rel') && node.getAttribute('rel').substr(0, 3) === 'mw:') {
node.removeAttribute('rel')
} else if (node.getAttribute('img')) {
/* Remove a few images Parsoid attributes */
node.removeAttribute('data-file-width')
node.removeAttribute('data-file-height')
node.removeAttribute('data-file-type')
}

/* Remove a few css calls */
filtersConfig.cssClassCallsBlackList.map((classname: string) => {
if (node.getAttribute('class')) {
node.setAttribute('class', node.getAttribute('class').replace(classname, ''))
}
})
}

const kartographerMaplinkNodes = Array.from<DominoElement>(parsoidDoc.querySelectorAll('.mw-kartographer-maplink')).filter((n) => !!n.textContent)
for (const node of kartographerMaplinkNodes) {
node.textContent = '🌍'
}

this.clearNodes(parsoidDoc, filtersConfig)
return parsoidDoc
}

Expand Down
30 changes: 29 additions & 1 deletion src/sanitize-argument.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,19 @@ const parametersWithArrayType = ['format']

export async function sanitize_all(argv: any) {
// extracting all arguments
const { articleList, addNamespaces, speed: _speed, adminEmail, mwUrl, customZimFavicon, optimisationCacheUrl, verbose, customZimLongDescription, customZimDescription } = argv
const {
articleList,
addNamespaces,
speed: _speed,
adminEmail,
mwUrl,
customZimFavicon,
optimisationCacheUrl,
verbose,
customZimLongDescription,
customZimDescription,
forceRender,
} = argv

sanitizeDoubleUsedParameters(argv)

Expand Down Expand Up @@ -73,6 +85,11 @@ export async function sanitize_all(argv: any) {
// sanitizing adminEmail
sanitize_adminEmail(adminEmail)

// sanitizing renderer
if (forceRender) {
sanitize_forceRender(forceRender)
}

// Redis client sanitization
// created a redis client and then closed it.
sanitize_redis(argv)
Expand Down Expand Up @@ -173,3 +190,14 @@ export function sanitize_customFlavour(customFlavour: string): string {
}) || null
)
}

/**
 * Validate the value given for the `forceRender` parameter.
 *
 * @param renderName - Render name to check; the comparison is exact and case-sensitive.
 * @returns The same `renderName` when it is one of the supported render names.
 * @throws Error with message `Invalid render name: <renderName>` for any other value.
 */
export function sanitize_forceRender(renderName: string): string {
  const supportedRenderNames: readonly string[] = ['VisualEditor', 'WikimediaDesktop', 'WikimediaMobile']
  if (!supportedRenderNames.includes(renderName)) {
    throw new Error(`Invalid render name: ${renderName}`)
  }
  return renderName
}
16 changes: 13 additions & 3 deletions src/util/saveArticles.ts
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ export function getArticleUrl(downloader: Downloader, dump: Dump, articleId: str
/*
* Fetch Articles
*/
export async function saveArticles(zimCreator: ZimCreator, downloader: Downloader, dump: Dump) {
export async function saveArticles(zimCreator: ZimCreator, downloader: Downloader, dump: Dump, forceRender = null) {
const jsModuleDependencies = new Set<string>()
const cssModuleDependencies = new Set<string>()
let jsConfigVars = ''
Expand All @@ -258,9 +258,19 @@ export async function saveArticles(zimCreator: ZimCreator, downloader: Downloade
const articlesTotal = await articleDetailXId.len()

const rendererBuilder = new RendererBuilder()
const rendererBuilderOptions: RendererBuilderOptions = {
renderType: 'auto',

let rendererBuilderOptions: RendererBuilderOptions
if (forceRender) {
rendererBuilderOptions = {
renderType: 'specific',
renderName: forceRender,
}
} else {
rendererBuilderOptions = {
renderType: 'auto',
}
}

const mainPageRenderer = await rendererBuilder.createRenderer(rendererBuilderOptions)
// TODO: article renderer will be switched to the mobile mode later
const articlesRenderer = await rendererBuilder.createRenderer(rendererBuilderOptions)
Expand Down
Loading

0 comments on commit 2c5553b

Please sign in to comment.