Speed up wiki scraping (#42)
* Scrape 20 pages at a time

This significantly speeds up wiki scraping.

* Wipe output folder by default

Is there a reason not to do this?
robotboy655 authored Feb 15, 2024
1 parent 4356d7c commit 5213c35
Showing 2 changed files with 9 additions and 2 deletions.
package.json (1 addition, 1 deletion)

@@ -20,7 +20,7 @@
   "scripts": {
     "clear-output": "rm -rf ./output/",
     "wiki-check-changed": "tsx ./src/cli-change-checker.ts",
-    "scrape-wiki": "tsx ./src/cli-scraper.ts --output ./output/ --customOverrides ./custom/",
+    "scrape-wiki": "tsx ./src/cli-scraper.ts --output ./output/ --customOverrides ./custom/ --wipe",
     "pack-release": "tsx ./src/cli-release-packer.ts --input ./output/ --output ./dist/release/",
     "publish-library": "tsx ./src/cli-library-publisher.ts --input ./output/ --output ./dist/libraries/garrysmod",
     "stylua-custom": "npx --yes @johnnymorganz/[email protected] ./custom",
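The hunk above only adds --wipe to the npm script; how cli-scraper.ts actually consumes the flag is outside the lines shown. As a rough sketch, assuming plain process.argv inspection (the function and parameter names below are illustrative, not the repo's API), the handling could look like this:

// Illustrative sketch only: the real option handling in cli-scraper.ts is not part of this diff.
import * as fs from 'fs';

function wipeOutputIfRequested(outputDir: string, argv: string[] = process.argv): void {
    if (argv.includes('--wipe')) {
        // Remove previous scrape output so stale pages do not survive a re-run,
        // mirroring what the "clear-output" npm script's rm -rf ./output/ does.
        fs.rmSync(outputDir, { recursive: true, force: true });
    }
}

Wiping by default makes the standalone "clear-output" script redundant for the normal scrape-wiki flow, which matches the commit's "Wipe output folder by default" intent.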
src/cli-scraper.ts (8 additions, 1 deletion)

@@ -82,6 +82,7 @@ async function startScrape() {
 
   const pageIndexes = await scrapeAndCollect(pageListScraper);
 
+  let queue: Promise<any>[] = [];
   for (const pageIndex of pageIndexes) {
     const pageMarkupScraper = new WikiPageMarkupScraper(`${baseUrl}/${pageIndex.address}?format=text`);
 
@@ -119,7 +120,13 @@ async function startScrape() {
       fs.writeFileSync(path.join(baseDirectory, moduleName, `${fileName}.json`), json);
     });
 
-    await pageMarkupScraper.scrape();
+    queue.push(pageMarkupScraper.scrape());
+
+    if (queue.length > 20)
+    {
+      await Promise.allSettled(queue);
+      queue = [];
+    }
   }
 
   console.log(`Done with scraping! You can find the output in ${baseDirectory}`);
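The change above stops awaiting each page scrape serially and instead queues the promises, flushing with Promise.allSettled once more than 20 are in flight; allSettled (unlike Promise.all) keeps the batch alive even if individual pages fail. One detail visible in the hunk: the trailing partial batch is never awaited before the "Done with scraping!" log. A generalized sketch of the same batch-flush pattern with a final flush added (scrapeInBatches and its task callbacks are illustrative names, not the repo's API):

// Generalized batch-flush sketch; illustrative, not the repo's actual code.
async function scrapeInBatches<T>(tasks: Array<() => Promise<T>>, batchSize = 20): Promise<void> {
    let queue: Promise<T>[] = [];
    for (const task of tasks) {
        queue.push(task());
        if (queue.length >= batchSize) {
            // Wait for the whole in-flight batch before starting the next one.
            await Promise.allSettled(queue);
            queue = [];
        }
    }
    // Flush the trailing partial batch so no scrape is left un-awaited.
    await Promise.allSettled(queue);
}

A trade-off of batch flushing: the next batch cannot start until the slowest request of the current one resolves, so a sliding-window limiter (e.g. the p-limit package) would hold concurrency closer to a steady 20; for a one-shot scraper the simpler batching here is likely good enough.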
