From 6381bb63ea32af23800463ac41fb003ed9ae52f6 Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Sat, 9 Nov 2024 19:24:09 +0100 Subject: [PATCH] Improve handling of backrefs to nonparticipating groups --- spec/match-backreference.spec.js | 30 +++++-- src/generate.js | 2 +- src/index.js | 6 +- src/parse.js | 18 ++++- src/transform.js | 130 +++++++++++++++++++++---------- 5 files changed, 132 insertions(+), 54 deletions(-) diff --git a/spec/match-backreference.spec.js b/spec/match-backreference.spec.js index 81dd8a4..db64227 100644 --- a/spec/match-backreference.spec.js +++ b/spec/match-backreference.spec.js @@ -73,6 +73,10 @@ describe('Backreference', () => { expect('').not.toFindMatch(r`(\g<2>(\2))`); }); + it('should not match references to groups not in the alternation path', () => { + expect('').not.toFindMatch(r`(a)|\1`); + }); + // For 1-9, else it becomes octal if not enough groups defined to the left, even if enough // groups defined to the right it('should throw for forward references to defined groups', () => { @@ -158,6 +162,10 @@ describe('Backreference', () => { expect('').not.toFindMatch(r`(\g<2>(\k<2>))`); }); + it('should not match references to groups not in the alternation path', () => { + expect('').not.toFindMatch(r`(a)|\k<1>`); + }); + it('should throw for forward references to defined groups', () => { expect(() => toDetails(r`\k<1>()`)).toThrow(); expect(() => toDetails(r`()\k<2>()`)).toThrow(); @@ -254,6 +262,10 @@ describe('Backreference', () => { expect('').not.toFindMatch(r`(\g<+1>(\k<-1>))`); }); + it('should not match references to groups not in the alternation path', () => { + expect('').not.toFindMatch(r`(a)|\k<-1>`); + }); + it('should throw for forward references to defined groups', () => { expect(() => toDetails(r`\k<+1>()`)).toThrow(); expect(() => toDetails(r`\k'+1'()`)).toThrow(); @@ -367,13 +379,8 @@ describe('Backreference', () => { }); }); - it('should throw for forward references to defined groups', () => { - expect(() => toDetails(r`\k(?)`)).toThrow(); - expect(() => toDetails(r`(?(?)\k)(?)`)).toThrow(); - }); - - it('should throw for forward references to defined groups even when subroutines add captures', () => { - expect(() => toDetails(r`\g\k(?)`)).toThrow(); + it('should not match references to groups not in the alternation path', () => { + expect('').not.toFindMatch(r`(?a)|\k`); }); it('should preclude groups not in the alternation path when multiplexing', () => { @@ -396,6 +403,15 @@ describe('Backreference', () => { maxTestTarget: maxTestTargetForDuplicateNames, }); }); + + it('should throw for forward references to defined groups', () => { + expect(() => toDetails(r`\k(?)`)).toThrow(); + expect(() => toDetails(r`(?(?)\k)(?)`)).toThrow(); + }); + + it('should throw for forward references to defined groups even when subroutines add captures', () => { + expect(() => toDetails(r`\g\k(?)`)).toThrow(); + }); }); }); diff --git a/src/generate.js b/src/generate.js index 5d07185..f5e970f 100644 --- a/src/generate.js +++ b/src/generate.js @@ -20,7 +20,7 @@ function generate(ast, options) { const minTargetEsNext = isMinTarget(opts.target, 'ESNext'); const rDepth = opts.maxRecursionDepth; if (rDepth !== null && (!Number.isInteger(rDepth) || rDepth < 2 || rDepth > 100)) { - throw new Error('Invalid maxRecursionDepth; use null or 2-100'); + throw new Error('Invalid maxRecursionDepth; use 2-100 or null'); } // If the output can't use flag groups, we need a pre-pass to check for the use of chars with diff --git a/src/index.js b/src/index.js index 90161bd..d3f5c27 100644 --- a/src/index.js +++ b/src/index.js @@ -138,9 +138,9 @@ class EmulatedRegExp extends RegExp { @returns {RegExpExecArray | null} */ exec(str) { - // Special case handling that requires coupling with changes for the specific strategy in the - // transformer. These changes add emulation support for some common patterns that are otherwise - // unsupportable. Only one subclass strategy is supported per pattern + // Special case handling that requires coupling with pattern changes for the specific strategy + // in the transformer. These changes add emulation support for some common patterns that are + // otherwise unsupportable. Only one subclass strategy is supported per pattern const useLastIndex = this.global || this.sticky; const pos = this.lastIndex; const exec = RegExp.prototype.exec; diff --git a/src/parse.js b/src/parse.js index 509b233..66f5c53 100644 --- a/src/parse.js +++ b/src/parse.js @@ -194,10 +194,22 @@ function parseBackreference(context) { const fromNum = (num, isRelative = false) => { const numCapturesToLeft = context.capturingGroups.length; let orphan = false; + // Note: It's not an error for numbered backrefs to come before their referenced group in Onig, + // but an error is the best path for this library because: + // 1. Most placements are mistakes and can never match (based on the Onig behavior for backrefs + // to nonparticipating groups). + // 2. Erroring matches the behavior of named backrefs. + // 3. The edge cases where they're matchable rely on rules for backref resetting within + // quantified groups that are different in JS and aren't emulatable. Note that it's not a + // backref in the first place if using `\10` or higher and not as many capturing groups are + // defined to the left (it's an octal or identity escape). + // [TODO] Ideally this would be refactored to include the backref in the AST when it's not an + // error in Onig (due to the reffed group being defined to the right), and the error handling + // would move to the transformer if (num > numCapturesToLeft) { - // [WARNING] Skipping the error messes up assumptions and probably create edge case issues, - // since backrefs are required to come after their captures; unfortunately this option is - // needed for TextMate grammars + // [WARNING] Skipping the error breaks assumptions and might create edge case issues, since + // backrefs are required to come after their captures; unfortunately this option is needed + // for TextMate grammars if (context.skipBackrefValidation) { orphan = true; } else { diff --git a/src/transform.js b/src/transform.js index 67cfb0b..8b374c5 100644 --- a/src/transform.js +++ b/src/transform.js @@ -177,7 +177,7 @@ const FirstPassVisitor = { }[value]; if (negate) { // POSIX classes are always nested in a char class; manually invert the range rather than - // using `[^...]` so it can be unwrapped since ES2018 doesn't support nested classes + // using `[^...]` so it can be unwrapped, since ES2018 doesn't support nested classes ascii = `\0-${cp(ascii.codePointAt(0) - 1)}${cp(ascii.codePointAt(2) + 1)}-\u{10FFFF}`; } replaceWith(parseFragment(`[${ascii}]`)); @@ -192,7 +192,7 @@ const FirstPassVisitor = { node.key = 'sc'; } } else if (kind === AstCharacterSetKinds.space) { - // Unlike JS, Onig's `\s` matches only ASCII space, tab, LF, VT, FF, and CR + // Unlike JS, Onig's `\s` matches only ASCII tab, space, LF, VT, FF, and CR const s = parseFragment('[ \t\n\v\f\r]'); s.negate = negate; replaceWith(s); @@ -408,26 +408,20 @@ const SecondPassVisitor = { // ## Track data for backref multiplexing const multiplexNodes = getOrCreate(multiplexCapturesToLeftByRef, ref, []); for (let i = 0; i < multiplexNodes.length; i++) { - // Captures added via subroutine expansion (possibly indirectly because they were child + // Captures added via subroutine expansion (maybe indirectly because they were descendant // captures of the reffed group or in a nested subroutine expansion) form a set with their // origin group and any other copies of it added via subroutines. Only the most recently // matched within this set is added to backref multiplexing. So search the list of already- // tracked multiplexed nodes for this group name or number to see if there's a node being // replaced by this capture const multiplex = multiplexNodes[i]; - const mpName = multiplex.node.name; if ( // This group is from subroutine expansion, and there's a multiplex value from either the // origin node or a prior subroutine expansion group with the same origin (origin === multiplex.node || (origin && origin === multiplex.origin)) || // This group is not from subroutine expansion, and it comes after a subroutine expansion // group that refers to this group - node === multiplex.origin || - // The multiplex node is a named group that's not in the current alternation path (which - // will mean it's nonparticipating for any following backrefs); remove it from - // multiplexing since backrefs to nonparticipating groups can't match in Onig but match - // the empty string in JS - (mpName && !getOrCreate(namedGroupsInScopeByAlt, parentAlt, new Map()).has(mpName)) + node === multiplex.origin ) { multiplexNodes.splice(i, 1); break; @@ -518,28 +512,25 @@ const ThirdPassVisitor = { state.highestOrphanBackref = Math.max(state.highestOrphanBackref, node.ref); return; } - const refNodes = state.reffedNodesByBackreference.get(node); - const unclosedCaps = getAllParents(node, node => node.type === AstTypes.CapturingGroup); + const reffedNodes = state.reffedNodesByBackreference.get(node); + const participants = reffedNodes.filter(reffedNode => canCaptureParticipateForBackref(reffedNode, node)); // For the backref's `ref`, use `number` rather than `name` because group names might have been // removed if they're duplicates within their alternation path, or they might be removed later // by the generator (depending on target) if they're duplicates within the overall pattern. - // Backrefs must come after groups they ref, so reffed node `number`s are already recalculated. - // Also, convert backrefs to not-yet-closed groups to `(?!)`; they can't match in Onig but - // match the empty string in JS - if (refNodes.length > 1) { - const alts = refNodes.map(reffedGroupNode => adoptAndSwapKids( + // Backrefs must come after groups they ref, so reffed node `number`s are already recalculated + if (!participants.length) { + // If no participating capture, convert backref to to `(?!)`; backrefs to nonparticipating + // groups can't match in Onig but match the empty string in JS + replaceWith(createLookaround({negate: true})); + } else if (participants.length > 1) { + // Multiplex + const alts = participants.map(reffedNode => adoptAndSwapKids( createAlternative(), - [ unclosedCaps.some(capture => capture.number === reffedGroupNode.number) ? - createLookaround({negate: true}) : - createBackreference(reffedGroupNode.number) - ] + [createBackreference(reffedNode.number)] )); replaceWith(adoptAndSwapKids(createGroup(), alts)); } else { - node.ref = refNodes[0].number; - if (unclosedCaps.some(capture => capture.number === node.ref)) { - replaceWith(createLookaround({negate: true})); - } + node.ref = participants[0].number; } }, @@ -559,8 +550,9 @@ const ThirdPassVisitor = { // to be valid in JS. This is needed to support TextMate grammars, which merge `begin` and // `end` patterns. An `end` pattern might therefore have backrefs to a group that doesn't // exist within `end`. This presents a dilemma since both Oniguruma and JS (with flag u or v) - // throw for backrefs to missing captures, and the backref can't be removed or changed. So - // here's a solution. NB: Orphan backrefs are only allowed if the `tmGrammar` option is used + // throw for backrefs to undefined captures, and the backref can't be removed or changed. So + // this is a solution that lets it through. Note: Orphan backrefs are only allowed if the + // `tmGrammar` option is used const numCapsNeeded = Math.max(state.highestOrphanBackref - state.numCapturesToLeft, 0); for (let i = 0; i < numCapsNeeded; i++) { const emptyCapture = createCapturingGroup(); @@ -572,7 +564,7 @@ const ThirdPassVisitor = { function adoptAndSwapKids(parent, kids) { kids.forEach(kid => kid.parent = parent); - parent[getChildContainerAccessor(parent)] = kids; + parent[getContainerAccessor(parent)] = kids; return parent; } @@ -690,6 +682,40 @@ function areFlagsEqual(a, b) { return a.dotAll === b.dotAll && a.ignoreCase === b.ignoreCase; } +function canCaptureParticipateForBackref(capture, backref) { + // Walks to the left (prev siblings), down (sibling descendants), up (parent), then back down + // (parent's prev sibling descendants) the tree in a loop + let rightmostPoint = backref; + do { + if (rightmostPoint.type === AstTypes.Pattern) { + // End of the line; capture is not in backref's alternation path + return false; + } + if (rightmostPoint.type === AstTypes.Alternative) { + // Skip past alts to their parent because we don't want to look at the kids of preceding alts + continue; + } + if (rightmostPoint === capture) { + // Capture is ancestor of backref + return false; + } + const kidsOfParent = getKids(rightmostPoint.parent); + for (const kid of kidsOfParent) { + if (kid === rightmostPoint) { + // Reached rightmost node in sibling list that we want to consider; break to parent loop + break; + } + if (kid === capture) { + return true; + } + if (hasDescendant(kid, capture)) { + return true; + } + } + } while ((rightmostPoint = rightmostPoint.parent)); + throw new Error('Unexpected path'); +} + // Creates a deep copy of the provided node, with special handling: // - Make `parent` props point to their parent in the copy // - Update the provided `originMap` for each cloned capturing group (outer and nested) @@ -697,7 +723,7 @@ function cloneCapturingGroup(obj, originMap, up, up2) { const store = Array.isArray(obj) ? [] : {}; for (const [key, value] of Object.entries(obj)) { if (key === 'parent') { - // If the last cloned item was a child container array, use the object above it + // If the last cloned item was a container array (for holding kids), use the object above it store.parent = Array.isArray(up) ? up2 : up; } else if (value && typeof value === 'object') { store[key] = cloneCapturingGroup(value, originMap, store, up); @@ -719,6 +745,7 @@ function createRecursion(ref) { }; } +// See also `getParentAlternative` function getAllParents(node, filterFn) { const results = []; while ((node = node.parent)) { @@ -729,17 +756,14 @@ function getAllParents(node, filterFn) { return results; } -function getChildContainerAccessor(node) { - if (node.alternatives) { - return 'alternatives'; - } - if (node.elements) { - return 'elements'; - } - if (node.classes) { - return 'classes'; +// Returns the string key for the container that holds the node's kids +function getContainerAccessor(node) { + for (const accessor of ['alternatives', 'classes', 'elements']) { + if (node[accessor]) { + return accessor; + } } - throw new Error('Accessor for child container unknown'); + return null; } function getCombinedFlagModsFromFlagNodes(flagNodes) { @@ -784,6 +808,19 @@ function getFlagModsFromFlags({dotAll, ignoreCase}) { return mods; } +function getKids(node) { + if (!node) { + throw new Error('Node expected'); + } + // [NOTE] Not handling `Regex` kids (`pattern`, `flags`) and `CharacterClassRange` kids (`min`, + // `max`) only because not needed by current callers + if (node.type === AstTypes.Quantifier) { + return [node.element]; + } + const accessor = getContainerAccessor(node); + return accessor && node[accessor]; +} + function getLeadingG(els) { if (!els.length) { return null; @@ -844,6 +881,19 @@ function getParentAlternative(node) { return null; } +function hasDescendant(node, descendant) { + const kids = getKids(node) ?? []; + for (const kid of kids) { + if ( + kid === descendant || + hasDescendant(kid, descendant) + ) { + return true; + } + } + return false; +} + function isValidGroupNameJs(name) { // JS group names are more restrictive than Onig; see // @@ -861,7 +911,7 @@ function parseFragment(pattern, {skipPropertyNameValidation} = {}) { } function prepContainer(node, kids) { - const accessor = getChildContainerAccessor(node); + const accessor = getContainerAccessor(node); // Set the parent for the default container of a new node node[accessor][0].parent = node; if (kids) {