From 6edfccc34f76bb3d6fdd420592a5f54fccbd6a62 Mon Sep 17 00:00:00 2001 From: luttje <2738114+luttje@users.noreply.github.com> Date: Mon, 1 Jan 2024 19:48:49 +0100 Subject: [PATCH] Prototype #8 --- .github/workflows/release.yml | 19 ++- package-lock.json | 90 ++++++++++++++ package.json | 1 + src/cli-scraper.ts | 169 ++++++++++++++++++++------- src/scrapers/wiki-history-scraper.ts | 2 +- src/utils/filesystem.ts | 28 +++++ 6 files changed, 262 insertions(+), 47 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 84f47f4..1f89d48 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -3,8 +3,18 @@ on: push: branches: - main + paths: + - 'src/**' + - 'custom/**' schedule: - cron: '0 0 1 * *' + workflow_dispatch: + inputs: + fullScrape: + description: 'Scrape the entire wiki, instead of only the changed pages' + required: true + default: false + type: boolean jobs: release: permissions: write-all @@ -32,7 +42,14 @@ jobs: uses: montudor/action-zip@v1 - name: Scrape wiki if: ${{ steps.wiki_confirm_no_changes.outcome == 'failure' }} - run: npm run scrape-wiki + env: + FULL_SCRAPE: ${{ inputs.fullScrape || false }} + run: | + if [ "$FULL_SCRAPE" = "true" ]; then + npm run wiki-scrape + else + npm run wiki-scrape -- --update-changed-by-tag $(git tag -l --sort=-v:refname | head -n 1) + fi - name: Format the output with StyLua if: ${{ steps.wiki_confirm_no_changes.outcome == 'failure' }} uses: JohnnyMorganz/stylua-action@v2.0.0 diff --git a/package-lock.json b/package-lock.json index 7ec5885..1cd7e0a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -15,6 +15,7 @@ "archiver": "^6.0.1", "cheerio": "^1.0.0-rc.12", "cross-env": "^7.0.3", + "extract-zip": "^2.0.1", "fetch-retry": "^5.0.6", "html-loader": "^4.2.0", "jest": "^29.7.0", @@ -1660,6 +1661,16 @@ "integrity": "sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==", "dev": true }, + "node_modules/@types/yauzl": { + "version": "2.10.3", + "resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.10.3.tgz", + "integrity": "sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==", + "dev": true, + "optional": true, + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@webassemblyjs/ast": { "version": "1.11.6", "resolved": "https://registry.npmjs.org/@webassemblyjs/ast/-/ast-1.11.6.tgz", @@ -2890,6 +2901,15 @@ "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", "dev": true }, + "node_modules/end-of-stream": { + "version": "1.4.4", + "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz", + "integrity": "sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==", + "dev": true, + "dependencies": { + "once": "^1.4.0" + } + }, "node_modules/enhanced-resolve": { "version": "5.15.0", "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.15.0.tgz", @@ -3135,6 +3155,41 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/extract-zip": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz", + "integrity": "sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==", + "dev": true, + "dependencies": { + "debug": "^4.1.1", + "get-stream": "^5.1.0", + "yauzl": "^2.10.0" + }, + "bin": { + "extract-zip": "cli.js" + }, + "engines": { + "node": ">= 
10.17.0" + }, + "optionalDependencies": { + "@types/yauzl": "^2.9.1" + } + }, + "node_modules/extract-zip/node_modules/get-stream": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz", + "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==", + "dev": true, + "dependencies": { + "pump": "^3.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/fast-deep-equal": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", @@ -3163,6 +3218,15 @@ "bser": "2.1.1" } }, + "node_modules/fd-slicer": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz", + "integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==", + "dev": true, + "dependencies": { + "pend": "~1.2.0" + } + }, "node_modules/fetch-retry": { "version": "5.0.6", "resolved": "https://registry.npmjs.org/fetch-retry/-/fetch-retry-5.0.6.tgz", @@ -4969,6 +5033,12 @@ "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", "dev": true }, + "node_modules/pend": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", + "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==", + "dev": true + }, "node_modules/picocolors": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz", @@ -5065,6 +5135,16 @@ "integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag==", "dev": true }, + "node_modules/pump": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", + "integrity": "sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==", + "dev": true, + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, "node_modules/punycode": { "version": "2.3.1", "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", @@ -6253,6 +6333,16 @@ "node": ">=12" } }, + "node_modules/yauzl": { + "version": "2.10.0", + "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz", + "integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==", + "dev": true, + "dependencies": { + "buffer-crc32": "~0.2.3", + "fd-slicer": "~1.1.0" + } + }, "node_modules/yn": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", diff --git a/package.json b/package.json index 8d2038b..d3f6e8d 100644 --- a/package.json +++ b/package.json @@ -34,6 +34,7 @@ "archiver": "^6.0.1", "cheerio": "^1.0.0-rc.12", "cross-env": "^7.0.3", + "extract-zip": "^2.0.1", "fetch-retry": "^5.0.6", "html-loader": "^4.2.0", "jest": "^29.7.0", diff --git a/src/cli-scraper.ts b/src/cli-scraper.ts index f1243f9..d396e8e 100644 --- a/src/cli-scraper.ts +++ b/src/cli-scraper.ts @@ -1,13 +1,15 @@ -import { WikiPageMarkupScraper } from './scrapers/wiki-page-markup-scraper.js'; -import { WikiPageListScraper } from './scrapers/wiki-page-list-scraper.js'; -import packageJson from '../package.json' assert { type: "json" }; +import { Command } from 'commander'; +import { convertWindowsToUnixPath, dateToFilename, saveFile, unzipFiles, walk } from './utils/filesystem.js'; import { 
GluaApiWriter } from './api-writer/glua-api-writer.js'; +import { RequestInitWithRetry } from 'fetch-retry'; import { scrapeAndCollect } from './scrapers/collector.js'; +import { WikiHistoryPageScraper } from './scrapers/wiki-history-scraper.js'; +import { WikiPageListScraper } from './scrapers/wiki-page-list-scraper.js'; +import { WikiPageMarkupScraper } from './scrapers/wiki-page-markup-scraper.js'; import { writeMetadata } from './utils/metadata.js'; -import { RequestInitWithRetry } from 'fetch-retry'; -import { Command } from 'commander'; -import path from 'path'; import fs from 'fs'; +import packageJson from '../package.json' assert { type: "json" }; +import path from 'path'; async function startScrape() { const program = new Command(); @@ -16,47 +18,98 @@ async function startScrape() { .version(packageJson.version) .description('Scrapes the Garry\'s Mod wiki for API information') .option('-o, --output ', 'The path to the directory where the output json and lua files should be saved', './output') - .option('-u, --url ', 'The pagelist URL of the Garry\'s Mod wiki that holds all pages to scrape', 'https://wiki.facepunch.com/gmod/') .option('-c, --customOverrides [path]', 'The path to a directory containing custom overrides for the API') .option('-w, --wipe', 'Clean the output directory before scraping', false) + .option('-t, --update-changed-by-tag ', 'Use the provided last git tag to only update pages that have changed since the last scrape', false) .parse(process.argv); const options = program.opts(); - - if (!options.url) { - console.error('No URL provided'); - process.exit(1); - } - const baseDirectory = options.output.replace(/\/$/, ''); const customDirectory = options.customOverrides?.replace(/\/$/, '') ?? null; - const baseUrl = options.url.replace(/\/$/, ''); - const pageListScraper = new WikiPageListScraper(`${baseUrl}/~pagelist?format=json`); + const baseUrl = 'https://wiki.facepunch.com/gmod'; const writer = new GluaApiWriter(); - const retryOptions: RequestInitWithRetry = { - retries: 5, - retryDelay: function(attempt, error, response) { - return Math.pow(2, attempt) * 500; // 500, 1000, 2000, 4000, 8000 - } - } - - pageListScraper.setRetryOptions(retryOptions); - - writeMetadata(baseUrl, baseDirectory); - if (options.wipe && fs.existsSync(baseDirectory)) fs.rmSync(baseDirectory, { recursive: true }); if (!fs.existsSync(baseDirectory)) fs.mkdirSync(baseDirectory, { recursive: true }); - if (customDirectory !== null) { - if (!fs.existsSync(customDirectory)) { - console.error(`Custom overrides directory ${customDirectory} does not exist`); + if (customDirectory !== null && !fs.existsSync(customDirectory)) { + console.error(`Custom overrides directory ${customDirectory} does not exist`); + process.exit(1); + } + + function writeLuaFile(moduleName: string, pageMarkups: any[]) { + const api = writer.writePages(pageMarkups); + + const moduleFile = path.join(baseDirectory, moduleName); + + if (!fs.existsSync(`${moduleFile}.lua`)) + fs.writeFileSync(`${moduleFile}.lua`, '---@meta\n\n'); + + fs.appendFileSync(`${moduleFile}.lua`, api); + } + + // If the update-changed-by-tag option is set, download that the json for that tag from the github releases as our starting point + if (options.updateChangedByTag && options.updateChangedByTag !== 'false') { + const url = `https://api.github.com/repos/luttje/glua-api-snippets/releases/tags/${options.updateChangedByTag}`; + const response = await fetch(url); + const json = await response.json(); + const asset = json.assets.find((asset: any) => 
asset.name === `${options.updateChangedByTag}.json.zip`); + + if (!asset) { + console.error(`No asset found for tag ${options.updateChangedByTag}`); + process.exit(1); + } + + const zipUrl = asset.browser_download_url; + const zipResponse = await fetch(zipUrl); + + if (!zipResponse.ok || zipResponse.body === null) { + console.error(`Failed to download zip file from ${zipUrl}`); process.exit(1); } + const zipFile = path.join(baseDirectory, `${options.updateChangedByTag}.json.zip`); + await saveFile(zipFile, zipResponse.body); + await unzipFiles(zipFile, baseDirectory); + fs.rmSync(zipFile); + const files = walk(baseDirectory); + + // Unzip the starting files so we can use them as a starting point + for (const file of files) { + const fileStat = fs.statSync(file); + + if (fileStat.isDirectory()) { + console.warn(`Skipping directory ${file} in custom (not supported)`); + continue; + } + + if (!file.endsWith('.json') || file.endsWith('__metadata.json')) + continue; + + const fileContent = fs.readFileSync(file, { encoding: 'utf-8' }); + let fileName = convertWindowsToUnixPath(file) + .replace(convertWindowsToUnixPath(baseDirectory + '/') + .replace(/^\.\//, ''), + '' + ) + .replace(/\.json$/, ''); + let moduleName = fileName; + + [moduleName, fileName] = fileName.split(/[:.\/]/, 2); + + const pageMarkups = JSON.parse(fileContent); + writeLuaFile(moduleName, pageMarkups); + } + } + + writeMetadata(baseUrl, baseDirectory); + + // Copy all files from the custom directory to the output directory. + // This allows us to override the output of the scraper with custom (hard-coded) files. + if (customDirectory !== null) { const files = fs.readdirSync(customDirectory); for (const file of files) { @@ -80,18 +133,49 @@ async function startScrape() { } } - const pageIndexes = await scrapeAndCollect(pageListScraper); + let pageAddresses: string[] = []; + + // If the update-changed-by-tag option is not set (or its false), scrape all pages on the wiki + if (!options.updateChangedByTag || options.updateChangedByTag === 'false') { + const pageListScraper = new WikiPageListScraper(`${baseUrl}/~pagelist?format=json`); + const retryOptions: RequestInitWithRetry = { + retries: 5, + retryDelay: function (attempt, error, response): number { + return Math.pow(2, attempt) * 500; // 500, 1000, 2000, 4000, 8000 + } + }; + pageListScraper.setRetryOptions(retryOptions); + + const pageIndexes = await scrapeAndCollect(pageListScraper); + + pageAddresses = pageIndexes.map((pageIndex) => { + return pageIndex.address; + }); + } else { + // Go through all history pages and add them to the list of pages to be scraped. + // Stop when we reach a page that has already been scraped. 
+ const historyResult = (await scrapeAndCollect(new WikiHistoryPageScraper(`${baseUrl}/~recentchanges`)))[0]; + pageAddresses = []; + + for (const history of historyResult.history) { + const tagName = dateToFilename(history.dateTime); + + if (options.updateChangedByTag === tagName) + break; + + const address = history.url.replace(/^\/gmod\//, ''); + pageAddresses.push(address); + } + } - for (const pageIndex of pageIndexes) { - const pageMarkupScraper = new WikiPageMarkupScraper(`${baseUrl}/${pageIndex.address}?format=text`); + for (const pageAddress of pageAddresses) { + const pageMarkupScraper = new WikiPageMarkupScraper(`${baseUrl}/${pageAddress}?format=text`); pageMarkupScraper.on('scraped', (url, pageMarkups) => { if (pageMarkups.length === 0) return; - const api = writer.writePages(pageMarkups); - - let fileName = pageIndex.address; + let fileName = pageAddress; let moduleName = fileName; if (fileName.includes('.') || fileName.includes(':') || fileName.includes('/')) { @@ -100,23 +184,18 @@ async function startScrape() { // Make sure modules like Entity and ENTITY are placed in the same file. moduleName = moduleName.toLowerCase(); + fileName = fileName.replace(/[^a-z0-9]/gi, '_').toLowerCase(); - const moduleFile = path.join(baseDirectory, moduleName); + writeLuaFile(moduleName, pageMarkups); - if (!fs.existsSync(`${moduleFile}.lua`)) - fs.writeFileSync(`${moduleFile}.lua`, '---@meta\n\n'); + const moduleFile = path.join(baseDirectory, moduleName); if (!fs.existsSync(moduleFile)) fs.mkdirSync(moduleFile, { recursive: true }); - fileName = fileName.replace(/[^a-z0-9]/gi, '_').toLowerCase(); - - // Lua API - fs.appendFileSync(path.join(baseDirectory, `${moduleName}.lua`), api); - - // JSON data + // Store JSON data so it can be used later to regenerate the lua files const json = JSON.stringify(pageMarkups, null, 2); - fs.writeFileSync(path.join(baseDirectory, moduleName, `${fileName}.json`), json); + fs.writeFileSync(path.join(moduleFile, `${fileName}.json`), json); }); await pageMarkupScraper.scrape(); diff --git a/src/scrapers/wiki-history-scraper.ts b/src/scrapers/wiki-history-scraper.ts index 99d1ed6..63285ad 100644 --- a/src/scrapers/wiki-history-scraper.ts +++ b/src/scrapers/wiki-history-scraper.ts @@ -73,7 +73,7 @@ export class WikiHistoryPageScraper extends PageTraverseScraper dateTime, user, change, - url + url, }); } diff --git a/src/utils/filesystem.ts b/src/utils/filesystem.ts index 889b34a..1e43cb3 100644 --- a/src/utils/filesystem.ts +++ b/src/utils/filesystem.ts @@ -1,3 +1,4 @@ +import extract from 'extract-zip'; import archiver from 'archiver'; import path from 'path'; import fs from 'fs'; @@ -88,3 +89,30 @@ export async function zipFiles(outputFile: string, filePaths: string[], trimPath await archive.finalize(); }); } + +export async function unzipFiles(zipFile: string, outputDirectory: string) { + return new Promise(async (resolve, reject) => { + if (!fs.existsSync(zipFile)) + reject(new Error(`File ${zipFile} does not exist.`)); + + await extract(zipFile, { dir: path.resolve(outputDirectory) }); + + resolve(); + }); +} + +export async function saveFile(file: string, stream: ReadableStream) { + let buffer = new Int8Array(); + const reader = stream.getReader(); + + while (true) { + const { done, value } = await reader.read(); + + if (done) + break; + + buffer = new Int8Array([...buffer, ...value]); + } + + fs.writeFileSync(file, buffer); +}
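
Two small observations on the new helpers added to src/utils/filesystem.ts, with a minimal alternative sketch (assuming Node 18+, where fetch() response bodies are web ReadableStreams; this sketch is not part of the diff above): extract-zip's extract() already returns a promise, so the new Promise wrapper around it is redundant and the early reject() call does not stop execution of the executor; and saveFile re-creates its Int8Array on every chunk it reads, copying the accumulated download each time. The same exported names and signatures could be kept:

import extract from 'extract-zip';
import fs from 'fs';
import path from 'path';
import { Readable } from 'node:stream';
import { pipeline } from 'node:stream/promises';

// extract() resolves once the archive is fully unpacked, so it can be awaited
// directly instead of being wrapped in a new Promise executor.
export async function unzipFiles(zipFile: string, outputDirectory: string): Promise<void> {
  if (!fs.existsSync(zipFile))
    throw new Error(`File ${zipFile} does not exist.`);

  await extract(zipFile, { dir: path.resolve(outputDirectory) });
}

// Pipe the fetch() response body straight to disk instead of growing a typed
// array chunk by chunk, which re-copies the whole buffer on every read.
export async function saveFile(file: string, stream: ReadableStream): Promise<void> {
  await pipeline(
    Readable.fromWeb(stream as import('node:stream/web').ReadableStream),
    fs.createWriteStream(file)
  );
}

Either version leaves the call sites in src/cli-scraper.ts unchanged (await saveFile(zipFile, zipResponse.body) followed by await unzipFiles(zipFile, baseDirectory)).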