Skip to content

Commit

Permalink
fix: use local landing page
Browse files Browse the repository at this point in the history
In the past we had to download the landing page from the original upstream
revision, but since 2019 ZIM landing pages have improved and can be used as-is.

This switches /wiki/index.html to use the "Main Page" from the ZIM file,
with only minor adjustments (paths, title).

Old logic is commented out in case we need it for ad-hoc fixups
in some language in the future.
  • Loading branch information
lidel committed Mar 5, 2021
1 parent 0d6b821 commit c4de986
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 28 deletions.
126 changes: 99 additions & 27 deletions src/site-transforms.ts
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ export const fixExceptions = async ({
try {
articleName = decodeURIComponent(file.name)
} catch (e) {
console.error(`unable to decodeURIComponent(${file.name}): `, e)
console.error(`[fixExceptions] unable to decodeURIComponent(${file.name}), skipping `)
continue
}
const segments = articleName.split('/')
Expand All @@ -137,7 +137,7 @@ export const fixExceptions = async ({
// and can be represented as directories
if (!segments.length || segments.some(s => !s.length)) continue

console.log('processing: ' + articleName)
// console.log('processing: ' + articleName)
const suffixFile = segments.pop() || ''

// creation of index.html breaks links created by zimdump:
Expand All @@ -152,8 +152,8 @@ export const fixExceptions = async ({
reworkScriptSrcs($fileHtml, 'img', linkFixups)
reworkScriptSrcs($fileHtml, 'script', linkFixups)

console.log(` fixed relative paths in ${filePath}`)
renameSync(filePath, `${filePath}.original`)
// console.log(` fixed relative paths in ${filePath}`)
// renameSync(filePath, `${filePath}.original`)
writeFileSync(filePath, $fileHtml.html())
}

Expand All @@ -162,21 +162,21 @@ export const fixExceptions = async ({
// ensure dir at each level exists and has no conflict
for (let i = 1; i < segments.length+1; i++) {
const parentDir = join(wikiFolder, ...segments.slice(0,i))
console.log(' checking parentDir: ' + parentDir)
// console.log(' checking parentDir: ' + parentDir)
if (existsSync(parentDir)) {
if (lstatSync(parentDir).isFile()) {
// If a file exists under the name of a directory we need,
// move file into a newly created dir
const articleTmp = `${parentDir}.tmp`
const articleDst = join(parentDir, 'index.html')
console.log(` parentDir is a file, renaming to ${articleDst}`)
// console.log(` parentDir is a file, renaming to ${articleDst}`)
renameSync(parentDir, articleTmp)
mkdirSync(parentDir, { recursive: true })
renameSync(articleTmp, articleDst)
fixRelativeLinks(articleDst, i)
}
} else {
console.log(` created parentDir`)
// console.log(` created parentDir`)
mkdirSync(parentDir, { recursive: true })
}
}
Expand All @@ -186,15 +186,15 @@ export const fixExceptions = async ({
const articleDir = join(wikiFolder, ...segments)
const articleDst = join(articleDir, suffixFile)

console.log(` renaming ${articleSrc}`)
// console.log(` renaming ${articleSrc}`)

if (existsSync(articleDst) && lstatSync(articleDst).isDirectory()) {
console.log(` directory already, renaming to ${articleDst}/index.html`)
// console.log(` directory already, renaming to ${articleDst}/index.html`)
const movedArticleDst = join(articleDst, 'index.html')
renameSync(articleSrc, movedArticleDst)
fixRelativeLinks(movedArticleDst, 1)
} else {
console.log(` renamed to ${articleDst}`)
// console.log(` renamed to ${articleDst}`)
renameSync(articleSrc, articleDst)
}
}
Expand Down Expand Up @@ -237,6 +237,8 @@ export const insertIndexRedirect = (options: Options) => {
unlinkSync(wikiIndexPath)
}

// Note that this is a temporary stub: we will most likely
// override this template with a better landing page during later steps.
writeFileSync(
wikiIndexPath,
template({
Expand Down Expand Up @@ -264,14 +266,91 @@ export const resolveDirectories = (options: Options) => {
return directories
}

/**
 * Resolves the on-disk path of an article after unpacking and the
 * fixExceptions fixups.
 *
 * An article is either a regular file at `<wikiFolder>/<article>` or a
 * directory of that name that holds the article as `index.html`.
 *
 * @param wikiFolder - directory the article name is resolved against
 * @param article - article name relative to `wikiFolder`
 * @returns path to the article file itself, or to its `index.html` when the
 *   article path is a directory
 * @throws Error when nothing exists at the article path, or when it is a
 *   directory that has no `index.html`
 */
const unpackedArticlePath = (wikiFolder: string, article: string) => {
  const candidate = join(wikiFolder, article)

  if (!existsSync(candidate)) {
    throw new Error(`unpacked '/wiki/${article}' is missing`)
  }

  // Plain file: the article lives exactly where its name says.
  if (!lstatSync(candidate).isDirectory()) {
    return candidate
  }

  // Directory: the article body is expected as index.html inside it.
  const nestedIndex = join(candidate, 'index.html')
  if (!existsSync(nestedIndex)) {
    throw new Error(`unpacked '/wiki/${article}' is a dir without index.html`)
  }
  return nestedIndex
}


/**
 * Generates the landing page at /wiki/index.html by copying the Kiwix main
 * page from the unpacked ZIM and adjusting its title and relative links.
 * The source article is copied, not moved, so it can still be loaded under
 * its own name if needed.
 *
 * Example for tr:
 *   /wiki/index.html is https://tr.wikipedia.org/wiki/Kullanıcı:The_other_Kiwix_guy/Landing
 *   /wiki/Anasayfa   is https://tr.wikipedia.org/wiki/Anasayfa
 *
 * @param options - `mainPage` and `kiwixMainPage` are read as article names
 *   relative to the wiki folder
 * @param directories - only `wikiFolder` is used
 * @throws Error when either article is missing after unpacking, when a custom
 *   landing has no canonical link, or when the upstream main page cannot be
 *   fetched successfully
 */
export const useKiwixLandingPage = async (
  options: Options,
  { wikiFolder }: Directories
): Promise<void> => {
  cli.action.start(` Generating landing page at /wiki/ from Kiwix one at /wiki/${options.kiwixMainPage}`)

  const landingPagePath = join(wikiFolder, 'index.html')

  // Both lookups throw when the article is missing, so from here on both
  // source paths are known to exist. The original main page path is resolved
  // purely for that validation.
  unpackedArticlePath(wikiFolder, options.mainPage)
  const kiwixMainPageSrc = unpackedArticlePath(wikiFolder, options.kiwixMainPage)

  // Use main page from Kiwix as the landing:
  // in most cases it is already the best landing available
  copyFileSync(kiwixMainPageSrc, landingPagePath)

  // Tweak page title of custom landing created by The_other_Kiwix_guy :-)
  if (kiwixMainPageSrc.includes('The_other_Kiwix_guy')) {
    // Set title to the one from the canonical (upstream) main page
    const $landingHtml = cheerio.load(readFileSync(landingPagePath).toString())
    const canonicalUrlString = $landingHtml('link[rel="canonical"]').attr('href')
    if (!canonicalUrlString) {
      // Report the file that was inspected: the href itself is empty here
      throw new Error(`Could not parse out canonical url for ${landingPagePath}`)
    }

    // Reuse the origin of the canonical URL, but point at the real main page
    const canonicalUrl = new URL(canonicalUrlString)
    canonicalUrl.pathname = `wiki/${options.mainPage}`

    const response = await fetch(canonicalUrl)
    if (!response.ok) {
      // Without this check the <title> of an HTTP error page would be
      // written into the landing page
      throw new Error(`Could not fetch ${canonicalUrl.href}: HTTP ${response.status}`)
    }
    const $remoteMainPageHtml = cheerio.load(await response.text())
    $landingHtml('title').text($remoteMainPageHtml('title').text())
    writeFileSync(landingPagePath, $landingHtml.html())
  }

  // Fixup relative paths, if needed: the landing now lives at the top of the
  // wiki folder, so each '/' in the source article name is one directory
  // level that relative references have to be moved up by.
  const depth = (options.kiwixMainPage.match(/\//g) || []).length
  if (depth) {
    const $fileHtml = cheerio.load(readFileSync(landingPagePath).toString())

    // One fixup per directory level
    const linkFixups = Array.from({ length: depth }, () => moveRelativeLinksUpOneLevel)
    reworkLinks($fileHtml, 'a:not(a[href^="http"]):not(a[href^="//"])', linkFixups)
    reworkLinks($fileHtml, 'link[href^="../"]', linkFixups)
    reworkScriptSrcs($fileHtml, 'img', linkFixups)
    reworkScriptSrcs($fileHtml, 'script', linkFixups)

    writeFileSync(landingPagePath, $fileHtml.html())
  }

  cli.action.stop()
}

// This is usually not used nor needed, but we keep this code around
// in case we need to generate some language quickly and there is a bug in ZIM
// that makes main page unusable.
// With this, we are able to fetch corresponding revision from upstream wikipedia
// and replace ZIM article with upstream one + fixup links and images.
// (This is no longer needed for most ZIMs after we switched to upstream zim-tools)
export const generateMainPage = async (
/*
export const fetchOriginalMainPage = async (
options: Options,
{ wikiFolder, imagesFolder }: Directories
) => {
Expand All @@ -286,25 +365,18 @@ export const generateMainPage = async (
cli.action.start(` Generating main page into /wiki/`)
const kiwixMainPageSrc = join(wikiFolder, `${options.kiwixMainPage}`)
let kiwixMainPageSrc = join(wikiFolder, `${options.kiwixMainPage}`)
// This is a crude fix that replaces exploded dir with single html
// just to fix main pages that happen to end up in _exceptions.
// A proper fix is needed for regular articles: https://github.com/ipfs/distributed-wikipedia-mirror/issues/80
// Handle namespace conflicts resolved in fixExceptions step
if (lstatSync(kiwixMainPageSrc).isDirectory()) {
const exceptionsPage = join(options.unpackedZimDir, '_exceptions', `A%2f${options.kiwixMainPage}`)
if (existsSync(exceptionsPage)) {
rmdirSync(kiwixMainPageSrc, { recursive: true })
copyFileSync(exceptionsPage, kiwixMainPageSrc)
const fixedSrc = `${kiwixMainPageSrc}/index.html`
if (existsSync(fixedSrc)) {
kiwixMainPageSrc = fixedSrc
} else {
throw new Error(`kiwixMainPageSrc "${kiwixMainPageSrc}" is a dir without index.html`)
}
}
cli.action.stop()

/*
cli.action.start(` Generating main page into ${mainPagePath} `)
const kiwixMainpage = readFileSync(kiwixMainPageSrc)
const $kiwixMainPageHtml = cheerio.load(kiwixMainpage.toString())
Expand Down Expand Up @@ -344,7 +416,7 @@ export const generateMainPage = async (
const $remoteMainPageHtml = cheerio.load(pageBody)
const $remoteContent = $remoteMainPageHtml('#content')
const remotePageTitle = $remoteMainPageHtml('title').text()
const remotePageTitle = $remoteMainPageHtml('title').text().replace(':The other Kiwix guy/', '')
$remoteContent.addClass('content')
$remoteContent.find('#siteNotice').remove()
Expand Down Expand Up @@ -443,8 +515,8 @@ export const generateMainPage = async (
} catch (error) {
cli.error(error)
}
*/
}
*/

export const appendJavscript = (
options: Options,
Expand Down
5 changes: 4 additions & 1 deletion src/zim-to-website.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import {
fixFavicon,
fixExceptions,
// generateMainPage,
useKiwixLandingPage,
insertIndexRedirect,
moveArticleFolderToWiki,
resolveDirectories
Expand Down Expand Up @@ -51,7 +52,9 @@ export const zimToWebsite = async (options: Options) => {
await fixExceptions(directories)
insertIndexRedirect(options)
appendJavascript(options, directories)
// usually main page is ok, so we dont need below
await useKiwixLandingPage(options, directories)

// The main page from Kiwix is usually fine, so we don't need the step below
// await generateMainPage(options, directories)

cli.log('done')
Expand Down

0 comments on commit c4de986

Please sign in to comment.