Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce MediaWiki Parsoid API to render articles #1899

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/Downloader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -181,12 +181,14 @@ class Downloader {
public async setBaseUrls() {
//* Objects order in array matters!
this.baseUrl = basicURLDirector.buildDownloaderBaseUrl([
{ condition: await MediaWiki.hasMediawikiParsoidApi(), value: MediaWiki.useParsoidApiUrl.href },
{ condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
{ condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
])

//* Objects order in array matters!
this.baseUrlForMainPage = basicURLDirector.buildDownloaderBaseUrl([
{ condition: await MediaWiki.hasMediawikiParsoidApi(), value: MediaWiki.useParsoidApiUrl.href },
{ condition: await MediaWiki.hasWikimediaDesktopRestApi(), value: MediaWiki.desktopRestApiUrl.href },
{ condition: await MediaWiki.hasVisualEditorApi(), value: MediaWiki.visualEditorApiUrl.href },
])
Expand Down
15 changes: 15 additions & 0 deletions src/MediaWiki.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import BaseURLDirector from './util/builders/url/base.director.js'
import ApiURLDirector from './util/builders/url/api.director.js'
import DesktopURLDirector from './util/builders/url/desktop.director.js'
import VisualEditorURLDirector from './util/builders/url/visual-editor.director.js'
import UseParsoidURLDirector from './util/builders/url/use-parsoid.director.js'
import { checkApiAvailability } from './util/mw-api.js'

class MediaWiki {
Expand Down Expand Up @@ -40,8 +41,10 @@ class MediaWiki {
private apiUrlDirector: ApiURLDirector
private wikimediaDesktopUrlDirector: DesktopURLDirector
private visualEditorURLDirector: VisualEditorURLDirector
private useParsoidURLDirector: UseParsoidURLDirector

public visualEditorApiUrl: URL
public useParsoidApiUrl: URL
public apiUrl: URL
public modulePath: string // only for reading
public _modulePathOpt: string // only for writing to generate modulePath
Expand All @@ -50,6 +53,7 @@ class MediaWiki {

#hasWikimediaDesktopRestApi: boolean | null
#hasVisualEditorApi: boolean | null
#hasMediawikiParsoidApi: boolean | null

set username(value: string) {
this.#username = value
Expand Down Expand Up @@ -99,6 +103,7 @@ class MediaWiki {

this.#hasWikimediaDesktopRestApi = null
this.#hasVisualEditorApi = null
this.#hasMediawikiParsoidApi = null
}

private constructor() {
Expand All @@ -121,16 +126,26 @@ class MediaWiki {
return this.#hasVisualEditorApi
}

/**
 * Tells whether the Parsoid-enabled parse API is available.
 * The result of the check is stored in #hasMediawikiParsoidApi; once it is no longer null,
 * the stored value is returned without requesting the API again.
 */
public async hasMediawikiParsoidApi(): Promise<boolean> {
  if (this.#hasMediawikiParsoidApi !== null) {
    return this.#hasMediawikiParsoidApi
  }

  // Availability is checked against the URL of the article set in apiCheckArticleId
  const probeUrl = this.useParsoidURLDirector.buildArticleURL(this.apiCheckArticleId)
  this.#hasMediawikiParsoidApi = await checkApiAvailability(probeUrl)
  return this.#hasMediawikiParsoidApi
}

/**
 * Derives the wiki endpoint URLs and the URL directors from this.baseUrl and the configured paths.
 * Order matters: each director is constructed from a URL that is assigned on an earlier line.
 */
private initMWApis() {
const baseUrlDirector = new BaseURLDirector(this.baseUrl.href)
// URLs built directly from the base URL
this.webUrl = baseUrlDirector.buildURL(this.#wikiPath)
this.apiUrl = baseUrlDirector.buildURL(this.#apiPath)
// URLs built on top of the API URL
this.apiUrlDirector = new ApiURLDirector(this.apiUrl.href)
this.visualEditorApiUrl = this.apiUrlDirector.buildVisualEditorURL()
this.useParsoidApiUrl = this.apiUrlDirector.buildUseParsoidURL()
this.desktopRestApiUrl = baseUrlDirector.buildDesktopRestApiURL(this.#restApiPath)
this.modulePath = baseUrlDirector.buildModuleURL(this._modulePathOpt)
// One director per rendering API, each bound to the URL computed above
this.wikimediaDesktopUrlDirector = new DesktopURLDirector(this.desktopRestApiUrl.href)
this.visualEditorURLDirector = new VisualEditorURLDirector(this.visualEditorApiUrl.href)
this.useParsoidURLDirector = new UseParsoidURLDirector(this.useParsoidApiUrl.href)
}

public async login(downloader: Downloader) {
Expand Down
1 change: 1 addition & 0 deletions src/mwoffliner.lib.ts
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ async function execute(argv: any) {
}

MediaWiki.apiCheckArticleId = mwMetaData.mainPage
await MediaWiki.hasMediawikiParsoidApi()
await MediaWiki.hasWikimediaDesktopRestApi()
await MediaWiki.hasVisualEditorApi()

Expand Down
4 changes: 4 additions & 0 deletions src/util/builders/url/api.director.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ export default class ApiURLDirector {
return urlBuilder.setDomain(this.baseDomain).setQueryParams({ action: 'visualeditor', mobileformat: 'html', format: 'json', paction: 'parse', page: '' }).build(true)
}

/**
 * Builds the `action=parse` API URL with `parsoid=1` on top of the base domain.
 * The `page` parameter is left empty here.
 */
buildUseParsoidURL() {
  const parsoidParseParams = { action: 'parse', format: 'json', prop: 'text|modules|jsconfigvars|headhtml', parsoid: '1', page: '' }
  const builder = urlBuilder.setDomain(this.baseDomain)

  return builder.setQueryParams(parsoidParseParams).build(true)
}

buildArticleApiURL(articleId: string) {
const domain = this.buildBaseArticleURL()

Expand Down
19 changes: 19 additions & 0 deletions src/util/builders/url/use-parsoid.director.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import urlBuilder from './url.builder.js'

/**
 * Interface to build article URLs on top of the base URL handed to the constructor
 * (MediaWiki.ts passes the Parsoid-enabled parse API URL)
 */
export default class UseParsoidURLDirector {
  baseDomain: string

  constructor(baseDomain: string) {
    this.baseDomain = baseDomain
  }

  /**
   * Builds the URL of one article: the URI-encoded article id is set as the `page` query parameter,
   * with '&' passed to the builder as second argument.
   */
  buildArticleURL(articleId: string) {
    const pageParam = { page: encodeURIComponent(articleId) }
    const builder = urlBuilder.setDomain(this.baseDomain)

    return builder.setQueryParams(pageParam, '&').build()
  }
}
2 changes: 1 addition & 1 deletion src/util/mw-api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ export function mwRetToArticleDetail(obj: QueryMwRet): KVS<ArticleDetail> {
/**
 * Probes an API URL and reports whether it can be used.
 *
 * The API counts as available when the GET request answers with HTTP 200, carries no
 * `mediawiki-api-error` header and does not warn that the `parsoid` parameter is unrecognized.
 *
 * @param url - URL to request (sent with maxRedirects: 0)
 * @param loginCookie - value sent in the `cookie` request header
 * @returns `true` when the API is usable; `false` otherwise, including when the request throws
 */
export async function checkApiAvailability(url: string, loginCookie = ''): Promise<boolean> {
  try {
    const resp = await axios.get(url, { maxRedirects: 0, headers: { cookie: loginCookie } })
    if (resp.status !== 200 || resp.headers['mediawiki-api-error']) {
      return false
    }

    // Every step is optional: the body may carry no warnings at all, or warnings without a `main` entry.
    // Without the full chain, such a response would throw and be misreported as "not available".
    const mainWarning = resp.data?.warnings?.main?.['*']
    return mainWarning !== 'Unrecognized parameter: parsoid.'
  } catch (err) {
    // A failed request is deliberately treated as "API not available" rather than as an error
    return false
  }
}
Expand Down
2 changes: 1 addition & 1 deletion src/util/renderers/abstract.renderer.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// Strategy used to pick a renderer
type renderType = 'auto' | 'desktop' | 'mobile' | 'specific'
// Names of the renderers that can be requested explicitly (used with the 'specific' render type)
type renderName = 'VisualEditor' | 'WikimediaDesktop' | 'WikimediaMobile' | 'MediawikiParsoid'

interface RendererBuilderOptionsBase {
renderType: renderType
Expand Down
105 changes: 105 additions & 0 deletions src/util/renderers/mediawiki-parsoid-renderer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
import domino from 'domino'
import { DELETED_ARTICLE_ERROR } from '../const.js'
import * as logger from '../../Logger.js'
import { Renderer } from './abstract.renderer.js'
import { getStrippedTitleFromHtml } from '../misc.js'
import { RenderOpts } from './abstract.renderer.js'

/*
Represent 'https://{wikimedia-wiki}/w/api.php?action=parse&format=json&prop=text|modules|jsconfigvars|headhtml&parsoid=1&page={title}'
or
'https://{3rd-party-wikimedia-wiki}/w/api.php?action=parse&format=json&prop=text|modules|jsconfigvars|headhtml&parsoid=1&page={title}'
*/
/**
 * Renderer for article data returned by the MediaWiki parse API.
 */
export class MediawikiParsoidRenderer extends Renderer {
  constructor() {
    super()
  }

  /**
   * Turns a parse API response into the renderer output.
   *
   * @param renderOpts - `data` (API response), `isMainPage`, `articleId` and `articleDetail` are read
   * @returns a one-element array describing the article when `data.parse` is present, otherwise ''
   *          (after logging the API error, or a generic message when there is none)
   * @throws Error when `data` is missing, or when `data.parse.revid` is 0 (deleted article)
   */
  public async render(renderOpts: RenderOpts): Promise<any> {
    const { data, isMainPage, articleId, articleDetail } = renderOpts

    if (!data) {
      throw new Error(`Cannot render [${data}] into an article`)
    }

    if (data.parse) {
      // Testing if article has been deleted between fetching list and downloading content.
      if (data.parse.revid === 0) {
        logger.error(DELETED_ARTICLE_ERROR)
        throw new Error(DELETED_ARTICLE_ERROR)
      }

      // The main page is kept without an injected title header
      const cleanedHtml = this.removeNoscript(data.parse.text['*'])
      const dataHtml = isMainPage ? cleanedHtml : this.injectHeader(cleanedHtml, articleDetail)
      const strippedTitle = getStrippedTitleFromHtml(dataHtml)

      return [
        {
          articleId,
          // Fallback title: every underscore of the id becomes a space, not only the first one
          displayTitle: strippedTitle || articleId.replace(/_/g, ' '),
          html: dataHtml,
          modules: data.parse.modules || '',
          modulescripts: data.parse.modulescripts || '',
          modulestyles: data.parse.modulestyles || '',
          // headhtml may be absent from the response; do not crash on it
          headhtml: data.parse.headhtml?.['*'] || '',
        },
      ]
    }

    if (data.error) {
      logger.error(`Error in retrieved article [${articleId}]:`, data.error)
      return ''
    }

    logger.error('Unable to parse data from mediawiki parsoid')
    return ''
  }

  // TODO: this was moved to the abstract renderer in PR1886
  // Prepends an <h1 class="article-header"> holding articleDetail.title to the body, when a title is available
  private injectHeader(content: string, articleDetail: any): string {
    const doc = domino.createDocument(content)

    if (articleDetail?.title) {
      const header = doc.createElement('h1')
      header.appendChild(doc.createTextNode(articleDetail.title))
      header.classList.add('article-header')

      const target = doc.querySelector('body.mw-body-content') || doc.querySelector('body')

      if (target) {
        target.insertAdjacentElement('afterbegin', header)
      }
    }

    return doc.documentElement.outerHTML
  }

  // Remove noscript elements but preserve inner content
  private removeNoscript(content: string) {
    const doc = domino.createDocument(content)
    const noscriptNodes = Array.from(doc.querySelectorAll('noscript'))

    noscriptNodes.forEach((noscriptEl) => {
      const noscriptElParent = noscriptEl.parentNode

      if (!noscriptElParent) {
        return
      }

      // Transfer noscript children into the parent node
      while (noscriptEl.firstChild) {
        if (noscriptEl.firstChild.nodeType === doc.TEXT_NODE) {
          const domElem = domino.createDocument(noscriptEl.innerHTML).documentElement
          // Remove any text content as it's no longer needed
          noscriptEl.removeChild(noscriptEl.firstChild)
          // Retrieve img from noscript
          const imgs = Array.from(domElem.querySelectorAll('img'))
          imgs.forEach((img) => {
            noscriptEl.appendChild(img)
          })
        }

        // Nothing is left to move when the noscript only held text without any image
        const nextChild = noscriptEl.firstChild
        if (nextChild) {
          noscriptElParent.insertBefore(nextChild, noscriptEl)
        }
      }

      // Remove noscript along with children
      noscriptElParent.removeChild(noscriptEl)
    })

    return doc.documentElement.outerHTML
  }
}
21 changes: 18 additions & 3 deletions src/util/renderers/renderer.builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,25 @@ import MediaWiki from './../../MediaWiki.js'
import { Renderer } from './abstract.renderer.js'
import { VisualEditorRenderer } from './visual-editor.renderer.js'
import { WikimediaDesktopRenderer } from './wikimedia-desktop.renderer.js'
import { MediawikiParsoidRenderer } from './mediawiki-parsoid-renderer.js'
import { RendererBuilderOptions } from './abstract.renderer.js'
import * as logger from './../../Logger.js'

export class RendererBuilder {
public async createRenderer(options: RendererBuilderOptions): Promise<Renderer> {
const { renderType, renderName } = options

const [hasVisualEditorApi, hasWikimediaDesktopRestApi] = await Promise.all([MediaWiki.hasVisualEditorApi(), MediaWiki.hasWikimediaDesktopRestApi()])
const [hasVisualEditorApi, hasWikimediaDesktopRestApi, hasMediawikiParsoidApi] = await Promise.all([
MediaWiki.hasVisualEditorApi(),
MediaWiki.hasWikimediaDesktopRestApi(),
MediaWiki.hasMediawikiParsoidApi(),
])

switch (renderType) {
case 'desktop':
if (hasWikimediaDesktopRestApi) {
if (hasMediawikiParsoidApi) {
return new MediawikiParsoidRenderer()
} else if (hasWikimediaDesktopRestApi) {
// Choose WikimediaDesktopRenderer if it's present, regardless of hasVisualEditorApi value
return new WikimediaDesktopRenderer()
} else if (hasVisualEditorApi) {
Expand All @@ -26,7 +33,9 @@ export class RendererBuilder {
// TODO: return WikimediaMobile renderer
break
case 'auto':
if (hasWikimediaDesktopRestApi) {
if (hasMediawikiParsoidApi) {
return new MediawikiParsoidRenderer()
} else if (hasWikimediaDesktopRestApi) {
// Choose WikimediaDesktopRenderer if it's present, regardless of hasVisualEditorApi value
return new WikimediaDesktopRenderer()
} else if (hasVisualEditorApi) {
Expand All @@ -38,6 +47,12 @@ export class RendererBuilder {
case 'specific':
// renderName argument is required for 'specific' mode
switch (renderName) {
case 'MediawikiParsoid':
if (hasMediawikiParsoidApi) {
return new MediawikiParsoidRenderer()
}
logger.error('Cannot create an instance of MediawikiParsoid renderer.')
process.exit(1)
case 'WikimediaDesktop':
if (hasWikimediaDesktopRestApi) {
return new WikimediaDesktopRenderer()
Expand Down
Loading