From c4ce2a5c4c3726d6bdd79ab2836204d5cf14738b Mon Sep 17 00:00:00 2001
From: Andrew Dupont <github@andrewdupont.net>
Date: Sat, 30 Nov 2024 16:29:21 -0800
Subject: [PATCH] =?UTF-8?q?[spell-check]=20Allow=20the=20user=20to=20white?=
 =?UTF-8?q?list=20sections=20of=20a=20buffer=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…for spellchecking on a per-language basis.

This opens up things that aren't really possible with the `excludedScopes` setting. For instance: you can specify `source.python comment, source.python string, source.js comment`; this will enable spellchecking in Python files for both comments and strings, while enabling spellchecking in JavaScript files for _only_ comments.

Closes #1135.
---
 packages/spell-check/lib/main.js              |  20 +--
 packages/spell-check/lib/scope-helper.js      | 155 ++++++++++++++++++
 packages/spell-check/lib/spell-check-view.js  | 121 +++++++++++++-
 packages/spell-check/package.json             |   5 +-
 packages/spell-check/spec/spell-check-spec.js | 125 +++++++++++++-
 5 files changed, 396 insertions(+), 30 deletions(-)
 create mode 100644 packages/spell-check/lib/scope-helper.js

diff --git a/packages/spell-check/lib/main.js b/packages/spell-check/lib/main.js
index df66c98767..8803a23c25 100644
--- a/packages/spell-check/lib/main.js
+++ b/packages/spell-check/lib/main.js
@@ -42,23 +42,9 @@ module.exports = {
         // Hook up changes to the configuration settings.
         this.excludedScopeRegexLists = [];
         this.subs.add(
-            atom.config.observe(
-                'spell-check.excludedScopes',
-                (excludedScopes) => {
-                    this.excludedScopeRegexLists = excludedScopes.map(
-                        (excludedScope) =>
-                            excludedScope
-                                .split(/\s+/)[0]
-                                .split('.')
-                                .filter((className) => className)
-                                .map(
-                                    (className) =>
-                                        new RegExp(`\\b${className}\\b`)
-                                )
-                    );
-                    return this.updateViews();
-                }
-            )
+            atom.config.onDidChange('spell-check.excludedScopes', () => {
+                return this.updateViews();
+            })
         );
 
         this.subs.add(
diff --git a/packages/spell-check/lib/scope-helper.js b/packages/spell-check/lib/scope-helper.js
new file mode 100644
index 0000000000..f43135e607
--- /dev/null
+++ b/packages/spell-check/lib/scope-helper.js
@@ -0,0 +1,155 @@
+function normalizeSegment(segment) {
+    if (!segment.startsWith('.')) return segment;
+    return segment.substring(1);
+}
+
+function segmentsMatch(
+    descriptorSegment,
+    selectorSegment,
+    { enforceSegmentOrder = false } = {}
+) {
+    let descriptorParts = normalizeSegment(descriptorSegment).split('.');
+    let selectorParts = normalizeSegment(selectorSegment).split('.');
+
+    if (selectorParts.length > descriptorParts.length) {
+        return false;
+    }
+
+    // Remove all parts from the descriptor scope name that aren't present in the
+    // selector scope name.
+    for (let i = descriptorParts.length - 1; i >= 0; i--) {
+        let part = descriptorParts[i];
+        if (!selectorParts.includes(part)) {
+            descriptorParts.splice(i, 1);
+        }
+    }
+    // Does order matter? It would if this were a TextMate scope, but Atom has
+    // broadly treated `.function.entity` as equivalent to `.entity.function`,
+    // even though it causes headaches in some places.
+    //
+    // We'll assume that order doesn't matter, but the user can opt into strict
+    // ordering if they want.
+    if (!enforceSegmentOrder) {
+        descriptorParts.sort();
+        selectorParts.sort();
+    }
+    return descriptorParts.join('.') === selectorParts.join('.');
+}
+
+class ScopeSelector {
+    static create(stringOrScopeSelector) {
+        if (typeof stringOrScopeSelector === 'string') {
+            return new ScopeSelector(stringOrScopeSelector);
+        } else if (stringOrScopeSelector instanceof ScopeSelector) {
+            return stringOrScopeSelector;
+        } else {
+            throw new TypeError(`Invalid argument`);
+        }
+    }
+
+    constructor(selectorString) {
+        this.selectorString = selectorString;
+        this.variations = selectorString.split(/,\s*/);
+    }
+
+    matches(scopeDescriptorOrArray, rawOptions = {}) {
+        let options = {
+            // Whether to treat (e.g.) `.function.entity` as distinct from
+            // `.entity.function`. Defaults to `false` to match prevailing Atom
+            // behavior.
+            enforceSegmentOrder: false,
+            ...rawOptions,
+        };
+        console.log(this, 'matches', scopeDescriptorOrArray);
+        let scopeList;
+        if (Array.isArray(scopeDescriptorOrArray)) {
+            scopeList = scopeDescriptorOrArray;
+        } else {
+            scopeList = scopeDescriptorOrArray.getScopesArray();
+        }
+
+        return this.variations.some((variation) =>
+            this.matchesVariation(scopeList, variation, options)
+        );
+    }
+
+    matchesVariation(scopeList, selectorString, options) {
+        let parts = selectorString.split(/\s+/);
+        if (parts.length > scopeList.length) return false;
+
+        let lastIndex = -1;
+
+        outer: for (let selectorPart of parts) {
+            // Find something in the descriptor that matches this selector part.
+            for (let [i, descriptorPart] of scopeList.entries()) {
+                // Ignore everything before our index cursor; this is what enforces the
+                // ordering of the scope selector.
+                if (i <= lastIndex) continue;
+                let doesMatch = segmentsMatch(
+                    descriptorPart,
+                    selectorPart,
+                    options
+                );
+                if (doesMatch) {
+                    lastIndex = i;
+                    continue outer;
+                }
+            }
+            // If we get this far, we searched the entire descriptor list for a
+            // selector part and failed to find it; hence this variation doesn't
+            // match.
+            return false;
+        }
+        // If we get this far, we made it through the entire gauntlet without
+        // hitting the early return. This variation matches!
+        return true;
+    }
+}
+
+// Private: A candidate for possible addition to the {ScopeDescriptor} API.
+//
+// Tests whether the given scope descriptor matches the given scope selector.
+//
+// A subset of the full TextMate scope selector syntax is supported:
+//
+// * Descendant scopes (e.g., `source.python string`); this function will
+//   enforce the ordering of segments.
+// * Variations (e.g., `comment.block, string.quoted`); this function will
+//   return true if any of the variations match.
+//
+// Not supported:
+//
+// * Subtraction syntax (e.g., `comment - block`).
+// * Left/right edge syntax (e.g., `L:comment.block`).
+//
+// For example: given the scope descriptor…
+//
+// [
+//   'source.js',
+//   'meta.block.function.js',
+//   'string.quoted.single.js'
+// ]
+//
+// …here are outcomes of various tests:
+//
+// scopeDescriptorMatchesSelector(descriptor, `source`) // -> true
+// scopeDescriptorMatchesSelector(descriptor, `text`) // -> false
+// scopeDescriptorMatchesSelector(descriptor, `source.js`) // -> true
+// scopeDescriptorMatchesSelector(descriptor, `source.python`) // -> false
+// scopeDescriptorMatchesSelector(descriptor, `source.js meta.block.function.js`) // -> true
+// scopeDescriptorMatchesSelector(descriptor, `source meta`) // -> true
+// scopeDescriptorMatchesSelector(descriptor, `source meta.block.class`) // -> false
+// scopeDescriptorMatchesSelector(descriptor, `source meta string`) // -> true
+// scopeDescriptorMatchesSelector(descriptor, `source string`) // -> true
+// scopeDescriptorMatchesSelector(descriptor, `source string meta`) // -> false
+//
+// - `scopeDescriptor` A {ScopeDescriptor} or a scope descriptor {Array}.
+// - `selector` A {String} representing a scope selector.
+function scopeDescriptorMatchesSelector(scopeDescriptor, selector) {
+    let scopeSelector = ScopeSelector.create(selector);
+    return scopeSelector.matches(scopeDescriptor);
+}
+
+module.exports = {
+    scopeDescriptorMatchesSelector,
+};
diff --git a/packages/spell-check/lib/spell-check-view.js b/packages/spell-check/lib/spell-check-view.js
index c6545bf810..f995c5e8e5 100644
--- a/packages/spell-check/lib/spell-check-view.js
+++ b/packages/spell-check/lib/spell-check-view.js
@@ -2,6 +2,18 @@ let SpellCheckView;
 const _ = require('underscore-plus');
 const { CompositeDisposable } = require('atom');
 const SpellCheckTask = require('./spell-check-task');
+const { scopeDescriptorMatchesSelector } = require('./scope-helper');
+
+// Tests whether a grammar's root scope matches a scope specified in the
+// `grammars` setting. Allows for a more generic name in the setting (e.g.,
+// `source` to match all `source.[x]` grammars).
+function topLevelScopeMatches(grammar, scope) {
+    if (scope === grammar) return true;
+    if (grammar.startsWith(`${scope}.`)) {
+        return true;
+    }
+    return false;
+}
 
 let CorrectionsView = null;
 
@@ -73,6 +85,15 @@ module.exports = SpellCheckView = class SpellCheckView {
             })
         );
 
+        this.disposables.add(
+            atom.config.observe(
+                'spell-check.excludedScopes',
+                (excludedScopes) => {
+                    this.excludedScopes = excludedScopes;
+                }
+            )
+        );
+
         this.subscribeToBuffer();
 
         this.disposables.add(this.editor.onDidDestroy(this.destroy.bind(this)));
@@ -118,6 +139,7 @@ module.exports = SpellCheckView = class SpellCheckView {
         this.unsubscribeFromBuffer();
 
         if (this.spellCheckCurrentGrammar()) {
+            this.scopesToSpellCheck = this.getSpellCheckScopesForCurrentGrammar();
             this.buffer = this.editor.getBuffer();
             this.bufferDisposable = new CompositeDisposable(
                 this.buffer.onDidStopChanging(
@@ -131,7 +153,69 @@ module.exports = SpellCheckView = class SpellCheckView {
 
     spellCheckCurrentGrammar() {
         const grammar = this.editor.getGrammar().scopeName;
-        return _.contains(atom.config.get('spell-check.grammars'), grammar);
+        let grammars = atom.config.get('spell-check.grammars');
+        let topLevelScopes = grammars.map((rawScope) => {
+            if (!rawScope.includes(' ')) return rawScope;
+            return rawScope.substring(0, rawScope.indexOf(' '));
+        });
+        return topLevelScopes.some((scope) => {
+            return topLevelScopeMatches(grammar, scope);
+        });
+        // return topLevelScopes.includes(grammar);
+    }
+
+    // Returns:
+    //
+    // * `true` if the entire buffer should be checked;
+    // * `false` if none of the buffer should be checked; or, if only certain
+    //   parts of the buffer should be checked,
+    // * an {Array} of scope names matching regions of the buffer that should
+    //   be checked.
+    getSpellCheckScopesForCurrentGrammar() {
+        const grammar = this.editor.getGrammar().scopeName;
+        let grammars = atom.config.get('spell-check.grammars');
+        let scopeList = [];
+        // Despite the name of this setting, spell-checking is no longer all or
+        // nothing on a per-grammar basis; we now allow users to opt into
+        // checking subsections of a buffer by adding descendant scopes. Each
+        // segment must begin with all or part of a root scope name (e.g.,
+        // `source.js`, `text.html`, but otherwise any valid scope selector is
+        // accepted here.)
+        //
+        // Examples:
+        //
+        // * `source.js comment.block`
+        // * `source comment, source string.quoted`
+        // * `text`
+        //
+        // The first example targets just JS block comments; the second targets
+        // all comments and quoted strings in _all_ source files; and the third
+        // targets any text format, whether HTML or Markdown or plaintext.
+        //
+        // This allows for more granular spell-checking than was possible
+        // before, even if the `excludeScopes` setting was utilized.
+        for (let rawScope of grammars) {
+            if (!rawScope.includes(' ')) {
+                // Any value that's just the bare root scope of the language
+                // (like `source.python`) means that we're spell-checking the
+                // entire buffer. This applies even if there's a later match
+                // for this grammar that's more restrictive.
+                if (topLevelScopeMatches(grammar, rawScope)) {
+                    return true;
+                }
+            } else {
+                // If the value also includes a descendant scope, it means we're
+                // spell-checking some subset of the buffer.
+                let index = rawScope.indexOf(' ');
+                let rootScope = rawScope.substring(0, index);
+                if (topLevelScopeMatches(grammar, rootScope)) {
+                    // There could be multiple of these — e.g., `source.python string,
+                    // source.python comment` — so we won't return early.
+                    scopeList.push(rawScope);
+                }
+            }
+        }
+        return scopeList.length > 0 ? scopeList : false;
     }
 
     destroyMarkers() {
@@ -147,6 +231,9 @@ module.exports = SpellCheckView = class SpellCheckView {
                 const scope = this.editor.scopeDescriptorForBufferPosition(
                     misspelling[0]
                 );
+                // Under the hood, we spell-check the entire document; but we
+                // might end up ignoring some of the misspellings based on
+                // their scope descriptor.
                 if (!this.scopeIsExcluded(scope)) {
                     result.push(
                         this.markerLayer.markBufferRange(misspelling, {
@@ -308,11 +395,31 @@ module.exports = SpellCheckView = class SpellCheckView {
         return (this.spellCheckModule.contextMenuEntries = []);
     }
 
-    scopeIsExcluded(scopeDescriptor, excludedScopes) {
-        return this.spellCheckModule.excludedScopeRegexLists.some((regexList) =>
-            scopeDescriptor.scopes.some((scopeName) =>
-                regexList.every((regex) => regex.test(scopeName))
-            )
-        );
+    scopeIsExcluded(scopeDescriptor) {
+        // Practically speaking, `this.scopesToSpellCheck` will either be `true`
+        // or an array of scope selectors. If it's the latter, then we should
+        // apply whitelisting and exclude anything that doesn't match.
+        if (Array.isArray(this.scopesToSpellCheck)) {
+            // If we know none of the subscopes match this region, we can
+            // exclude it even before we get to the `excludedScopes` setting.
+            let someMatch = this.scopesToSpellCheck.some(
+                (scopeToSpellCheck) => {
+                    return scopeDescriptorMatchesSelector(
+                        scopeDescriptor,
+                        scopeToSpellCheck
+                    );
+                }
+            );
+            if (!someMatch) return true;
+        }
+        // Whether or not we applied whitelisting above, excluded scopes take
+        // precedence; anything that doesn't make it through this gauntlet
+        // gets excluded.
+        return this.excludedScopes.some((excludedScope) => {
+            return scopeDescriptorMatchesSelector(
+                scopeDescriptor,
+                excludedScope
+            );
+        });
     }
 };
diff --git a/packages/spell-check/package.json b/packages/spell-check/package.json
index 61ac11aa6d..862fb9f1b4 100644
--- a/packages/spell-check/package.json
+++ b/packages/spell-check/package.json
@@ -20,7 +20,8 @@
   "repository": "https://github.com/pulsar-edit/pulsar",
   "license": "MIT",
   "engines": {
-    "atom": "*"
+    "atom": "*",
+    "node": ">=14"
   },
   "scripts": {
     "format": "prettier --write \"spec/*.js\" \"lib/**/*.js\" \"script/*.js\" --loglevel warn"
@@ -37,7 +38,7 @@
         "source.rst",
         "text.restructuredtext"
       ],
-      "description": "List of scopes for languages which will be checked for misspellings. See [the README](https://github.com/pulsar-edit/pulsar/blob/master/packages/spell-check/README.md) for more information on finding the correct scope for a specific language.",
+      "description": "List of scopes for languages which will be checked for misspellings. See [the README](https://github.com/pulsar-edit/pulsar/blob/master/packages/spell-check/README.md) for more information on finding the correct scope for a specific language. If you want to spell-check only parts of some languages, you may specify a more detailed scope selector (like `source.js comment.block` or `source string`); but the first part of the selector must match the language scope.",
       "order": 1
     },
     "excludedScopes": {
diff --git a/packages/spell-check/spec/spell-check-spec.js b/packages/spell-check/spec/spell-check-spec.js
index cdc32c18f9..4c74200dfd 100644
--- a/packages/spell-check/spec/spell-check-spec.js
+++ b/packages/spell-check/spec/spell-check-spec.js
@@ -47,8 +47,8 @@ describe('Spell check', function () {
     });
 
     afterEach(async () => {
-      await languageMode.atTransactionEnd();
-      SpellCheckTask.clear();
+        await languageMode.atTransactionEnd();
+        SpellCheckTask.clear();
     });
 
     it('decorates all misspelled words', async function () {
@@ -77,9 +77,124 @@ describe('Spell check', function () {
         expect(textForMarker(misspellingMarkers[1])).toEqual('bok');
     });
 
+    it('allows certain sub-scopes to be whitelisted into spell checking, implicitly excluding anything that does not match', async () => {
+        editor.setText(
+            `speledWrong = 5;
+function speledWrong() {}
+// We only care about mispelings in comments and strings!
+let foo = "this is speled wrong"
+class SpeledWrong {}`
+        );
+
+        atom.config.set('spell-check.useLocales', true);
+        atom.config.set('spell-check.grammars', [
+            'source.js comment',
+            'source.js string',
+            'text.plain.null-grammar',
+        ]);
+
+        {
+            await conditionPromise(() => getMisspellingMarkers().length > 0);
+            const markers = getMisspellingMarkers();
+            expect(markers.map((marker) => marker.getBufferRange())).toEqual([
+                [
+                    [2, 22],
+                    [2, 32],
+                ],
+                [
+                    [3, 19],
+                    [3, 25],
+                ],
+            ]);
+        }
+    });
+
+    it('interprets a bare root scope as opting out of scope whitelisting, even when other more specific segments are present', async () => {
+        editor.setText(
+            `speledWrong = 5;
+function speledWrong() {}
+// We only care about mispelings in comments and strings!
+let foo = "this is speled wrong"
+class SpeledWrong {}`
+        );
+
+        atom.config.set('spell-check.useLocales', true);
+        atom.config.set('spell-check.grammars', [
+            // Exactly as above, but with an extra `'source.js'` listing; this will
+            // supersede the more specific settings below.
+            'source.js',
+            'source.js comment',
+            'source.js string',
+            'text.plain.null-grammar',
+        ]);
+
+        {
+            await conditionPromise(() => getMisspellingMarkers().length > 0);
+            const markers = getMisspellingMarkers();
+            expect(markers.map((marker) => marker.getBufferRange())).toEqual([
+                [
+                    [0, 0],
+                    [0, 11],
+                ],
+                [
+                    [1, 9],
+                    [1, 20],
+                ],
+                [
+                    [2, 22],
+                    [2, 32],
+                ],
+                [
+                    [3, 4],
+                    [3, 7],
+                ],
+                [
+                    [3, 19],
+                    [3, 25],
+                ],
+                [
+                    [4, 6],
+                    [4, 17],
+                ],
+            ]);
+        }
+    });
+
+    it('allows a generic root scope like "source"', async () => {
+        editor.setText(
+            `speledWrong = 5;
+function speledWrong() {}
+// We only care about mispelings in comments and strings!
+let foo = "this is speled wrong"
+class SpeledWrong {}`
+        );
+
+        atom.config.set('spell-check.useLocales', true);
+        atom.config.set('spell-check.grammars', [
+            'source comment',
+            'source string',
+            'text.plain.null-grammar',
+        ]);
+
+        {
+            await conditionPromise(() => getMisspellingMarkers().length > 0);
+            const markers = getMisspellingMarkers();
+            expect(markers.map((marker) => marker.getBufferRange())).toEqual([
+                [
+                    [2, 22],
+                    [2, 32],
+                ],
+                [
+                    [3, 19],
+                    [3, 25],
+                ],
+            ]);
+        }
+    });
+
     it('allows certain scopes to be excluded from spell checking', async function () {
         editor.setText(
-`speledWrong = 5;
+            `speledWrong = 5;
 function speledWrong() {}
 class SpeledWrong {}`
         );
@@ -126,7 +241,9 @@ class SpeledWrong {}`
         }
 
         {
-            atom.config.set('spell-check.excludedScopes', ['.entity.name.type.class']);
+            atom.config.set('spell-check.excludedScopes', [
+                '.entity.name.type.class',
+            ]);
             await conditionPromise(() => getMisspellingMarkers().length === 2);
             const markers = getMisspellingMarkers();
             expect(markers.map((marker) => marker.getBufferRange())).toEqual([