Skip to content

Commit

Permalink
Merge pull request #88 from ipfs/feat/handle-exceptions
Browse files Browse the repository at this point in the history
feat: handle _exceptions directory
  • Loading branch information
lidel authored Mar 5, 2021
2 parents 325940d + c4de986 commit e3f02a7
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 40 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,6 @@ notes.md
/snapshots
node_modules
/zim-tools
/kiwix-tools
/kiwix-tools

bin/zimdump
13 changes: 5 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,17 @@ RUN apt update
RUN apt -y install --no-install-recommends git ca-certificates curl wget apt-utils

# install:
# - zimdump from zim-tools_linux-x86_64-2021-02-12 (2.2.0 nightly)
# - node and yarn
# - go-ipfs
RUN curl - sL https://ipfs.io/ipfs/QmaXutNuSv9T7w62TzMMKUgtxs9g81GvMt1vKqduLn77Yj -o /usr/local/bin/zimdump \
&& chmod +x /usr/local/bin/zimdump \
&& curl -sL https://deb.nodesource.com/setup_14.x -o nodesource_setup.sh \
RUN curl -sL https://deb.nodesource.com/setup_14.x -o nodesource_setup.sh \
&& bash nodesource_setup.sh \
&& apt -y install --no-install-recommends nodejs \
&& npm install -g yarn \
&& wget -nv https://dist.ipfs.io/go-ipfs/v0.7.0/go-ipfs_v0.7.0_linux-amd64.tar.gz \
&& tar xvfz go-ipfs_v0.7.0_linux-amd64.tar.gz \
&& wget -nv https://dist.ipfs.io/go-ipfs/v0.8.0/go-ipfs_v0.8.0_linux-amd64.tar.gz \
&& tar xvfz go-ipfs_v0.8.0_linux-amd64.tar.gz \
&& mv go-ipfs/ipfs /usr/local/bin/ipfs \
&& rm -r go-ipfs && rm go-ipfs_v0.7.0_linux-amd64.tar.gz \
&& ipfs init --profile badgerds --empty-repo \
&& rm -r go-ipfs && rm go-ipfs_v0.8.0_linux-amd64.tar.gz \
&& ipfs init -p server,local-discovery,flatfs,randomports --empty-repo \
&& ipfs config --json 'Experimental.ShardingEnabled' true

# TODO: move repo init after external volume is mounted
Expand Down
5 changes: 4 additions & 1 deletion mirrorzim.sh
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,11 @@ if [ -z ${MAIN_PAGE_VERSION+x} ]; then
MAIN_PAGE_VERSION=""
fi

printf "\nEnsure zimdump is present...\n"
PATH=$PATH:$(realpath ./bin)
which zimdump &> /dev/null || (curl --progress-bar -L https://ipfs.io/ipfs/bafybeibotxexiycu4luq7b6d6sh3oi2t2cvpvbimms6rpmbalbbyfrddyq -o ./bin/zimdump && chmod +x ./bin/zimdump)

printf "\nDownload the zim file...\n"
printf "\nDownload and verify the zim file...\n"
ZIM_FILE_SOURCE_URL="$(./tools/getzim.sh download $WIKI_TYPE $WIKI_TYPE $LANGUAGE_CODE all maxi latest | grep 'URL:' | cut -d' ' -f3)"
ZIM_FILE=$(echo $ZIM_FILE_SOURCE_URL | rev | cut -d'/' -f1 | rev)
TMP_DIRECTORY="./tmp/$(echo $ZIM_FILE | cut -d'.' -f1)"
Expand Down
3 changes: 3 additions & 0 deletions src/article-transforms.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ export const moveRelativeLinksUpOneLevel = (href: string) => {
}

/**
 * Rewrites a link so that it still resolves after the document containing it
 * has been moved one directory deeper (e.g. an article saved as `Foo` that is
 * relocated to `Foo/index.html`).
 *
 * - hrefs whose target does not depend on the directory of the document are
 *   returned untouched: empty hrefs, fragment-only (`#...`), root-relative
 *   (`/...`), protocol-relative (`//...`) and anything starting with `http`
 * - every other href is relative to the document directory, so it gets one
 *   extra `../` prepended (for hrefs that already start with `../` this is
 *   the same as turning the leading `../` into `../../`)
 *
 * @param href raw value of a link attribute
 * @returns href adjusted for the additional directory level
 */
export const moveRelativeLinksDownOneLevel = (href: string) => {
  // startsWith('/') also covers protocol-relative '//host/path' links
  const isLocationIndependent =
    href === '' ||
    href.startsWith('#') ||
    href.startsWith('/') ||
    href.startsWith('http')

  if (isLocationIndependent) {
    return href
  }
  return `../${href}`
}

Expand Down
210 changes: 186 additions & 24 deletions src/site-transforms.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import {
appendHtmlPostfix,
makeScriptLinksRelativeToWiki,
moveRelativeLinksUpOneLevel,
moveRelativeLinksDownOneLevel,
prefixRelativeRoot,
reworkLinks,
reworkScriptSrcs
Expand Down Expand Up @@ -94,7 +95,21 @@ export const fixExceptions = async ({
wikiFolder
}: Directories) => {

/* TODO this needs more work
/*
for every FOO directory in wiki/FOO
find article _exceptions/A%2fFOO
if exists, move it to wiki/FOO/index.html
for every file matching _exceptions/A%2f*
split name into segments
for each but the last segment
check if wiki/FOO exists,
if exists and is a directory, do nothing
if does not exist, create a dir
if exists, but is a file, replace file with a dir, and move file to FOO/index.html
finally, write last segment under wiki/FOO/bar
*/

// Articles with "/" in their name, like "foo/bar", produce conflicts, so they are saved as
// url-escaped flat files in the exceptions directory
// What we do here is to take every "foo" exception and rename it to foo/index.html,
Expand All @@ -105,23 +120,84 @@ export const fixExceptions = async ({
}
const dir = opendirSync(exceptionsDir)
for await (let file of dir) {
const articleName = decodeURIComponent(file.name)
console.log(articleName)
let articleName
try {
articleName = decodeURIComponent(file.name)
} catch (e) {
console.error(`[fixExceptions] unable to decodeURIComponent(${file.name}), skipping `)
continue
}
const segments = articleName.split('/')

// only process exceptions from A/ namespace
if (segments[0] !== 'A') continue
segments[0] = 'wiki'
segments.shift() // remove A/

// only process exceptions where segments have 1+ length
// and can be represented as directories
if (!segments.length || segments.some(s => !s.length)) continue

// console.log('processing: ' + articleName)
const suffixFile = segments.pop() || ''

// creation of index.html breaks links created by zimdump:
// needs manual adjustment of relative links to be prefixed with ../
const fixRelativeLinks = (filePath: string, depth: number) => {
const fileBytes = readFileSync(filePath)
const $fileHtml = cheerio.load(fileBytes.toString())

const linkFixups = Array.from({ length: depth }, (x, i) => moveRelativeLinksDownOneLevel)
reworkLinks($fileHtml, 'a:not(a[href^="http"]):not(a[href^="//"])', linkFixups)
reworkLinks($fileHtml, 'link[href^="../"]', linkFixups)
reworkScriptSrcs($fileHtml, 'img', linkFixups)
reworkScriptSrcs($fileHtml, 'script', linkFixups)

// console.log(` fixed relative paths in ${filePath}`)
// renameSync(filePath, `${filePath}.original`)
writeFileSync(filePath, $fileHtml.html())
}

const articleDir = join(unpackedZimDir, ...segments)
if (!existsSync(articleDir)) {
// problem: articleDir may not exist and neither its parent,
// and the root one is a file and not a dir (eg A/Australia/Foo/index.html blocked by A/Australia flat article)
mkdirSync(articleDir, { recursive: true })
// if article is not A/foo but A/FOO/bar parent dirs need to be inspected
if (segments.length) {
// ensure dir at each level exists and has no conflict
for (let i = 1; i < segments.length+1; i++) {
const parentDir = join(wikiFolder, ...segments.slice(0,i))
// console.log(' checking parentDir: ' + parentDir)
if (existsSync(parentDir)) {
if (lstatSync(parentDir).isFile()) {
// If a file exists under the name of a directory we need,
// move file into a newly created dir
const articleTmp = `${parentDir}.tmp`
const articleDst = join(parentDir, 'index.html')
// console.log(` parentDir is a file, renaming to ${articleDst}`)
renameSync(parentDir, articleTmp)
mkdirSync(parentDir, { recursive: true })
renameSync(articleTmp, articleDst)
fixRelativeLinks(articleDst, i)
}
} else {
// console.log(` created parentDir`)
mkdirSync(parentDir, { recursive: true })
}
}
}

const articleSrc = join(exceptionsDir, file.name)
const articleDest = join(articleDir, 'index.html')
renameSync(articleSrc, articleDest)
}
*/
const articleDir = join(wikiFolder, ...segments)
const articleDst = join(articleDir, suffixFile)

// console.log(` renaming ${articleSrc}`)

if (existsSync(articleDst) && lstatSync(articleDst).isDirectory()) {
// console.log(` directory already, renaming to ${articleDst}/index.html`)
const movedArticleDst = join(articleDst, 'index.html')
renameSync(articleSrc, movedArticleDst)
fixRelativeLinks(movedArticleDst, 1)
} else {
// console.log(` renamed to ${articleDst}`)
renameSync(articleSrc, articleDst)
}
}
// TODO: remove _exceptions?
}

Expand Down Expand Up @@ -161,6 +237,8 @@ export const insertIndexRedirect = (options: Options) => {
unlinkSync(wikiIndexPath)
}

// note that this is a temporary stub; we will most likely
// override this template with a better landing page during later steps
writeFileSync(
wikiIndexPath,
template({
Expand Down Expand Up @@ -188,7 +266,91 @@ export const resolveDirectories = (options: Options) => {
return directories
}

export const generateMainPage = async (
// Resolves the on-disk path of an article after unpacking and fixExceptions fixups.
// A plain file is returned as-is; for a directory the nested index.html is returned.
// Throws when the article does not exist, or is a directory without index.html.
const unpackedArticlePath = (wikiFolder: string, article: string) => {
  const candidate = join(wikiFolder, article)

  if (!existsSync(candidate)) {
    throw new Error(`unpacked '/wiki/${article}' is missing`)
  }
  if (!lstatSync(candidate).isDirectory()) {
    return candidate
  }

  const nestedIndex = join(candidate, 'index.html')
  if (!existsSync(nestedIndex)) {
    throw new Error(`unpacked '/wiki/${article}' is a dir without index.html`)
  }
  return nestedIndex
}


// We copy "kiwix main page" to /wiki/index.html + adjust links.
// This way original one can still be loaded if needed
// Example for tr:
// /wiki/index.html is https://tr.wikipedia.org/wiki/Kullanıcı:The_other_Kiwix_guy/Landing
// /wiki/Anasayfa is https://tr.wikipedia.org/wiki/Anasayfa
/**
 * Creates /wiki/index.html as a copy of the main page shipped in the ZIM.
 *
 * Steps:
 * 1. resolve the unpacked paths of `options.mainPage` and `options.kiwixMainPage`
 *    (throws if either one is missing)
 * 2. copy the Kiwix main page to `<wikiFolder>/index.html`
 * 3. if the Kiwix page path contains 'The_other_Kiwix_guy', fetch the page at
 *    `wiki/<options.mainPage>` on the host of the landing's canonical link and
 *    reuse its <title>
 * 4. if `options.kiwixMainPage` contains "/" separators, rework relative links
 *    once per separator, because the copy sits that many levels higher than
 *    the source article
 *
 * @param options conversion options; `mainPage` and `kiwixMainPage` are read
 * @param directories resolved directories; only `wikiFolder` is used
 * @throws Error when the landing page has no canonical link to derive the
 *         remote main page URL from
 */
export const useKiwixLandingPage = async (
  options: Options,
  { wikiFolder }: Directories
) => {
  cli.action.start(` Generating landing page at /wiki/ from Kiwix one at /wiki/${options.kiwixMainPage}`)

  const landingPagePath = join(wikiFolder, 'index.html')
  const originalMainPageSrc = unpackedArticlePath(wikiFolder, options.mainPage)
  const kiwixMainPageSrc = unpackedArticlePath(wikiFolder, options.kiwixMainPage)

  // Use main page from Kiwix as the landing:
  // In most cases it is already the best landing available
  copyFileSync(kiwixMainPageSrc, landingPagePath)

  // Tweak page title of custom landing created by The_other_Kiwix_guy :-)
  if (kiwixMainPageSrc.includes('The_other_Kiwix_guy') && existsSync(originalMainPageSrc)) {
    // Set title to one from canonical main page
    const $landingHtml = cheerio.load(readFileSync(landingPagePath).toString())
    const canonicalUrlString = $landingHtml('link[rel="canonical"]').attr('href')
    if (!canonicalUrlString) {
      // name the file we looked at: the missing value itself carries no information
      throw new Error(`Could not find canonical url in ${landingPagePath}`)
    }
    // keep the host of the canonical link, but point it at the original main page
    const canonicalUrl = new URL(canonicalUrlString)
    canonicalUrl.pathname = `wiki/${options.mainPage}`
    const response = await fetch(canonicalUrl)
    const pageBody = await response.text()
    const $remoteMainPageHtml = cheerio.load(pageBody)
    const pageTitle = $remoteMainPageHtml('title').text()
    $landingHtml('title').text(pageTitle)
    writeFileSync(landingPagePath, $landingHtml.html())
  }

  // Fixup relative paths, if needed:
  // one "/" in the article name means the copy moved one level up
  const depth = (options.kiwixMainPage.match(/\//g) || []).length
  if (depth) {
    const $landingHtml = cheerio.load(readFileSync(landingPagePath).toString())

    // apply the one-level fixup once per directory level
    const linkFixups = Array.from({ length: depth }, () => moveRelativeLinksUpOneLevel)
    reworkLinks($landingHtml, 'a:not(a[href^="http"]):not(a[href^="//"])', linkFixups)
    reworkLinks($landingHtml, 'link[href^="../"]', linkFixups)
    reworkScriptSrcs($landingHtml, 'img', linkFixups)
    reworkScriptSrcs($landingHtml, 'script', linkFixups)

    writeFileSync(landingPagePath, $landingHtml.html())
  }

  cli.action.stop()
}

// This is usually not used nor needed, but we keep this code around
// in case we need to generate some language quickly and there is a bug in ZIM
// that makes main page unusable.
// With this, we are able to fetch corresponding revision from upstream wikipedia
// and replace ZIM article with upstream one + fixup links and images.
// (This is no longer needed for most ZIMs after we switched to upstream zim-tools)
/*
export const fetchOriginalMainPage = async (
options: Options,
{ wikiFolder, imagesFolder }: Directories
) => {
Expand All @@ -201,18 +363,17 @@ export const generateMainPage = async (
// /wiki/Anasayfa is https://tr.wikipedia.org/wiki/Anasayfa
const mainPagePath = join(wikiFolder, 'index.html')
cli.action.start(` Generating main page into ${mainPagePath} `)
cli.action.start(` Generating main page into /wiki/`)
const kiwixMainPageSrc = join(wikiFolder, `${options.kiwixMainPage}`)
let kiwixMainPageSrc = join(wikiFolder, `${options.kiwixMainPage}`)
// This is a crude fix that replaces exploded dir with single html
// just to fix main pages that happen to end up in _exceptions.
// A proper fix is needed for regular articles: https://github.com/ipfs/distributed-wikipedia-mirror/issues/80
// Handle namespace conflicts resolved in fixExceptions step
if (lstatSync(kiwixMainPageSrc).isDirectory()) {
const exceptionsPage = join(options.unpackedZimDir, '_exceptions', `A%2f${options.kiwixMainPage}`)
if (existsSync(exceptionsPage)) {
rmdirSync(kiwixMainPageSrc, { recursive: true })
renameSync(exceptionsPage, kiwixMainPageSrc)
const fixedSrc = `${kiwixMainPageSrc}/index.html`
if (existsSync(fixedSrc)) {
kiwixMainPageSrc = fixedSrc
} else {
throw new Error(`kiwixMainPageSrc "${kiwixMainPageSrc}" is a dir without index.html`)
}
}
Expand Down Expand Up @@ -255,7 +416,7 @@ export const generateMainPage = async (
const $remoteMainPageHtml = cheerio.load(pageBody)
const $remoteContent = $remoteMainPageHtml('#content')
const remotePageTitle = $remoteMainPageHtml('title').text()
const remotePageTitle = $remoteMainPageHtml('title').text().replace(':The other Kiwix guy/', '')
$remoteContent.addClass('content')
$remoteContent.find('#siteNotice').remove()
Expand Down Expand Up @@ -355,6 +516,7 @@ export const generateMainPage = async (
cli.error(error)
}
}
*/

export const appendJavscript = (
options: Options,
Expand Down
12 changes: 8 additions & 4 deletions src/zim-to-website.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ import {
copyImageAssetsIntoWiki,
fixFavicon,
fixExceptions,
generateMainPage,
// generateMainPage,
useKiwixLandingPage,
insertIndexRedirect,
moveArticleFolderToWiki,
resolveDirectories
Expand All @@ -24,8 +25,8 @@ export const zimToWebsite = async (options: Options) => {
cli.log('-------------------------')
cli.log(` Unpacked Zim Directory: ${options.unpackedZimDir}`)
cli.log(` Zim File: ${options.zimFile}`)
cli.log(` Main Page: ${options.mainPage}`)
cli.log(` Kiwix Main Page: ${options.kiwixMainPage}`)
cli.log(` Original Main Page: ${options.mainPage ? decodeURIComponent(options.mainPage) : null}`)
cli.log(` ZIM's Main Page: ${options.kiwixMainPage}`)

if (options.hostingDNSDomain) {
cli.log(` Hosting DNS Domain: ${options.hostingDNSDomain}`)
Expand All @@ -51,7 +52,10 @@ export const zimToWebsite = async (options: Options) => {
await fixExceptions(directories)
insertIndexRedirect(options)
appendJavascript(options, directories)
await generateMainPage(options, directories)
await useKiwixLandingPage(options, directories)

// usually the main page from Kiwix is OK, so we don't need the call below
// await generateMainPage(options, directories)

cli.log('done')
}
4 changes: 2 additions & 2 deletions tools/getzim.sh
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ cmd_download_url() {
log "URL: $URL"

# below is a mixture of https://stackoverflow.com/a/19841872/3990041, my knowledge and guesswork :P
SHA256=$(curl -sI "$URL" | grep digest | grep "SHA-256" | sed "s|digest: SHA-256=||g" | base64 -d | od -t x1 -An | tr "\n" " " | sed "s| ||g")
SHA256=$(curl -sI "$URL" | grep digest | grep "SHA-256" | sed "s|digest: SHA-256=||g" | base64 -d -i | od -t x1 -An | tr "\n" " " | sed "s| ||g")

log "SHA256: $SHA256"
}
Expand All @@ -257,7 +257,7 @@ cmd_download() {

dl_cycle() {
log "Downloading $OUTNAME..."
wget --continue -P ./snapshots "$URL"
wget --continue -q --show-progress --progress=bar:force -P ./snapshots "$URL"
return $?
}

Expand Down

0 comments on commit e3f02a7

Please sign in to comment.