diff --git a/README.md b/README.md index ff4fc43..753001e 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,7 @@ CLI to generate XML sitemaps for static sites from local filesystem Options: -b, --base base URL (required) -r, --root root working directory (default: ".") + -m, --match globs to match (default: ["**/*.html"]) -i, --ignore globs to ignore (default: ["404.html"]) -c, --changefreq comma-separated glob-changefreq pairs -p, --priority comma-separated glob-priority pairs @@ -59,7 +60,7 @@ Options: #### HTML parsing -By default, all matched files are piped through a fast +By default, all matched `.html` files are piped through a fast [HTML parser](https://github.com/fb55/htmlparser2) to detect if the `noindex` [meta tag](https://developers.google.com/search/docs/advanced/crawling/block-indexing#meta-tag) is set - typically in the form of `<meta name="robots" content="noindex" />` - in which case that file @@ -99,13 +100,13 @@ Disabled by default; pass option `--slash` to enable. [always added](https://github.com/zerodevx/static-sitemap-cli/tree/v1#to-slash-or-not-to-slash) to root domains. -#### Ignore some files +#### Match or ignore files -The `-i` flag allows multiple entries. By default, it's set to the `["404.html"]`. Change the glob -ignore patterns to suit your use-case like so: +The `-m` and `-i` flags allow multiple entries. By default, they are set to `["**/*.html"]` and +`["404.html"]` respectively. Change the glob patterns to suit your use-case like so: ``` -$ sscli ... -i '404.html' '**/ignore/**' 'this/other/specific/file.html' +$ sscli ... 
-m '**/*.{html,jpg,png}' -i '404.html' 'ignore/**' 'this/other/specific/file.html' ``` #### Glob-[*] pairs @@ -143,7 +144,13 @@ $ sscli -b https://x.com -r dist -f xml -o > www/sm.xml #### Get subset of a directory ``` -$ sscli -b https://x.com/foo -r dist/foo -f txt -o > dist/sitemap.txt +$ sscli -b https://x.com/foo -r dist/foo -f xml -o > dist/sitemap.xml +``` + +#### Generate TXT sitemap for image assets + +``` +$ sscli -b https://x.com -r dist -m '**/*.{jpg,jpeg,gif,png,bmp,webp,svg}' -f txt ``` ## Programmatic Use @@ -160,6 +167,7 @@ import { const options = { base: 'https://x.com', root: 'path/to/root', + match: ['**/*.html'], ignore: ['404.html'], changefreq: [], priority: [], diff --git a/package.json b/package.json index 63bafb1..9bdb54f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "static-sitemap-cli", - "version": "2.0.1", + "version": "2.1.0", "description": "CLI to generate XML sitemaps for static sites from local filesystem", "author": "Jason Lee ", "type": "module", @@ -38,10 +38,7 @@ ], "license": "ISC", "homepage": "https://npmjs.com/package/static-sitemap-cli", - "repository": { - "type": "git", - "url": "https://github.com/zerodevx/static-sitemap-cli.git" - }, + "repository": "github:zerodevx/static-sitemap-cli", "keywords": [ "sscli", "sitemap", diff --git a/src/cli.js b/src/cli.js index 93cd30b..8c45542 100644 --- a/src/cli.js +++ b/src/cli.js @@ -16,6 +16,7 @@ program .description('CLI to generate XML sitemaps for static sites from local filesystem') .option('-b, --base ', 'base URL (required)') .option('-r, --root ', 'root working directory', '.') + .option('-m, --match ', 'globs to match', ['**/*.html']) .option('-i, --ignore ', 'globs to ignore', ['404.html']) .option('-c, --changefreq ', 'comma-separated glob-changefreq pairs') .option('-p, --priority ', 'comma-separated glob-priority pairs') diff --git a/src/index.js b/src/index.js index b24aae2..09a99be 100644 --- a/src/index.js +++ b/src/index.js @@ -10,8 +10,8 
@@ function log(msg) { console.warn('\x1b[36m%s\x1b[0m', `[sscli] ${msg}`) } -async function getFiles({ root, ignore, verbose }) { - const files = await fastglob('**/*.html', { cwd: root, stats: true, ignore }) +async function getFiles({ root, match, ignore, verbose }) { + const files = await fastglob(match, { cwd: root, stats: true, ignore }) if (!files.length) { throw new Error('NO_MATCHES') } @@ -43,16 +43,18 @@ function detectNoindex(path) { } async function transformUrl( - file, + { path, stats: { mtime } }, { root, base, changefreq, priority, robots, clean, slash, verbose } ) { - if (robots) { - if (await detectNoindex(nodepath.join(root, file.path))) { - if (verbose) log(`noindex: ${file.path}`) - return - } + if ( + robots && + nodepath.extname(path) === '.html' && + (await detectNoindex(nodepath.join(root, path))) + ) { + if (verbose) log(`noindex: ${path}`) + return } - let url = base + file.path.split(nodepath.sep).join('/') + let url = base + path.split(nodepath.sep).join('/') if (clean) { if (url.slice(-11) === '/index.html') url = url.slice(0, -11) else if (url.slice(-5) === '.html') url = url.slice(0, -5) @@ -61,12 +63,12 @@ async function transformUrl( const check = (pairs, tagname) => { for (let a = pairs.length - 1; a >= 0; a--) { const p = pairs[a].split(',') - if (micromatch.isMatch(file.path, p[0])) return { [tagname]: p[1] } + if (micromatch.isMatch(path, p[0])) return { [tagname]: p[1] } } } return { loc: url, - lastmod: file.stats.mtime.toISOString(), + lastmod: mtime.toISOString(), ...(changefreq && changefreq.length && check(changefreq, 'changefreq')), ...(priority && priority.length && check(priority, 'priority')) }