From 42624bde2c639ff32327743a751e21a7c1bdfa3a Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Wed, 6 Nov 2024 13:56:54 +0100 Subject: [PATCH] Add loose mode --- spec/match-assertion.spec.js | 33 ++++++++++++++++----- src/compile.js | 4 +-- src/transform.js | 57 ++++++++++++++++++++---------------- src/utils.js | 7 +++++ 4 files changed, 67 insertions(+), 34 deletions(-) diff --git a/spec/match-assertion.spec.js b/spec/match-assertion.spec.js index 2f4cf3c..4a8bf59 100644 --- a/spec/match-assertion.spec.js +++ b/spec/match-assertion.spec.js @@ -127,6 +127,14 @@ describe('Assertion', () => { expect('a').toExactlyMatch(r`(?=\G)a`); expect('a').toExactlyMatch(r`(?=\Ga)a`); expect('aaba'.match(toRegExp(r`(?=\Ga)a`, '', {global: true}))).toEqual(['a', 'a']); + expect(['a', 'b']).toExactlyMatch(r`(?=\G)a|\Gb`); + // Similar but not covered + [ r`(?=\G|)a`, + r`(?:(?=\G))?a`, + r`(?=\G)a|b`, + ].forEach(pattern => { + expect(() => compile(pattern)).toThrow(); + }); }); it('should allow if trailing in a leading positive lookbehind', () => { @@ -135,6 +143,14 @@ describe('Assertion', () => { let re = toRegExp(r`(?<=a\G)a`); re.lastIndex = 3; expect(re.exec('abaa')?.index).toBe(3); + expect(['a', 'b']).toExactlyMatch(r`(?<=\G)a|\Gb`); + // Similar but not covered + [ r`(?<=\G|)a`, + r`(?:(?<=\G))?a`, + r`(?<=\G)a|b`, + ].forEach(pattern => { + expect(() => compile(pattern)).toThrow(); + }); }); it('should throw if leading in a leading positive lookbehind', () => { @@ -154,7 +170,7 @@ describe('Assertion', () => { expect(() => compile(r`\Ga|\G\Gb`)).toThrow(); }); - // Could support by replacing `\G` with `(?!)` + // Note: Could support by replacing `\G` with `(?!)`, but these forms aren't useful it('should throw at unmatchable positions', () => { expect(() => compile(r`a\Gb`)).toThrow(); expect(() => compile(r`(?<=a\Gb)`)).toThrow(); @@ -162,12 +178,15 @@ describe('Assertion', () => { expect(() => compile(r`(?=ab\G)`)).toThrow(); }); - // Unsupported; some or all might be emulatable - it('should throw for other unsupported uses', () => { - expect(() => compile(r`(?<=\G|)a`)).toThrow(); - expect(() => compile(r`(?:(?<=\G))?a`)).toThrow(); - expect('a').toExactlyMatch(r`(?=\G)a|\Gb`); - expect(() => compile(r`(?=\G)a|b`)).toThrow(); + it('should allow unsupported forms if using loose emulation', () => { + const patterns = [ + r`a\G`, + r`\G|`, + ]; + patterns.forEach(pattern => { + expect(() => compile(pattern)).toThrow(); + expect(toRegExp(pattern, '', {emulation: 'loose'}).sticky).toBe(true); + }); }); describe('subclass strategies', () => { diff --git a/src/compile.js b/src/compile.js index 09398f5..18a9797 100644 --- a/src/compile.js +++ b/src/compile.js @@ -2,13 +2,13 @@ import {generate} from './generate.js'; import {parse} from './parse.js'; import {tokenize} from './tokenize.js'; import {transform} from './transform.js'; -import {EsVersion, Target} from './utils.js'; +import {EmulationMode, EsVersion, Target} from './utils.js'; import {atomic, possessive} from 'regex/atomic'; import {recursion} from 'regex-recursion'; /** @typedef {{ - emulation?: 'strict' | 'default' | 'loose'; + emulation?: keyof EmulationMode; global?: boolean; hasIndices?: boolean; maxRecursionDepth?: number | null; diff --git a/src/transform.js b/src/transform.js index d71644c..a6d2c75 100644 --- a/src/transform.js +++ b/src/transform.js @@ -3,7 +3,7 @@ import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, As import {tokenize} from './tokenize.js'; import {traverse} from './traverse.js'; import {JsUnicodeProperties, PosixClassesMap} from './unicode.js'; -import {cp, getNewCurrentFlags, getOrCreate, isMinTarget, r, Target} from './utils.js'; +import {cp, EmulationMode, getNewCurrentFlags, getOrCreate, isMinTarget, r, Target} from './utils.js'; /** @typedef {{ @@ -22,7 +22,7 @@ then down-convert to the desired JS target version. @param {{ allowSubclassBasedEmulation?: boolean; bestEffortTarget?: keyof Target; - emulation?: 'strict' | 'default' | 'loose'; + emulation?: keyof EmulationMode; }} [options] @returns {RegexAst} */ @@ -40,7 +40,7 @@ function transform(ast, options) { ...options, }; // AST changes that work together with a `RegExp` subclass to add advanced emulation - const strategy = opts.allowSubclassBasedEmulation ? applySubclassStrategies(ast) : null; + const strategy = opts.allowSubclassBasedEmulation ? applySubclassStrategies(ast, opts.emulation) : null; const firstPassState = { emulation: opts.emulation, flagDirectivesByAlt: new Map(), @@ -114,7 +114,7 @@ const FirstPassVisitor = { }, }, - Assertion({node, ast, remove, replaceWith}, {supportedGNodes}) { + Assertion({node, ast, remove, replaceWith}, {emulation, supportedGNodes}) { const {kind, negate} = node; if (kind === AstAssertionKinds.line_end) { // Onig's only line break char is line feed, unlike JS @@ -123,8 +123,8 @@ const FirstPassVisitor = { // Onig's only line break char is line feed, unlike JS replaceWith(parseFragment(r`(?<=\A|\n)`)); } else if (kind === AstAssertionKinds.search_start) { - if (!supportedGNodes.has(node)) { - throw new Error(r`Uses "\G" in a way that's unsupported; try allowSubclassBasedEmulation`); + if (!supportedGNodes.has(node) && emulation !== 'loose') { + throw new Error(r`Uses "\G" in a way that's unsupported`); } ast.flags.sticky = true; remove(); @@ -266,7 +266,7 @@ const FirstPassVisitor = { !node.flags.enable && !node.flags.disable && delete node.flags; }, - Pattern({node}, {supportedGNodes}) { + Pattern({node}, {emulation, supportedGNodes}) { // For `\G` to be accurately emulatable using JS flag y, it must be at (and only at) the start // of every top-level alternative (with complex rules for what determines being at the start). // Additional `\G` error checking in `Assertion` visitor @@ -286,8 +286,8 @@ const FirstPassVisitor = { hasAltWithoutLeadG = true; } } - if (hasAltWithLeadG && hasAltWithoutLeadG) { - throw new Error(r`Uses "\G" in a way that's unsupported; try allowSubclassBasedEmulation`); + if (hasAltWithLeadG && hasAltWithoutLeadG && emulation !== 'loose') { + throw new Error(r`Uses "\G" in a way that's unsupported`); } // These nodes will be removed when traversed; other `\G` nodes will error leadingGs.forEach(g => supportedGNodes.add(g)) @@ -567,7 +567,7 @@ function adoptAndSwapKids(parent, kids) { return parent; } -function applySubclassStrategies(ast) { +function applySubclassStrategies(ast, emulation) { // Special case handling that requires coupling with a `RegExp` subclass (see `WrappedRegExp`). // These changes add emulation support for some common patterns that are otherwise unsupportable. // Only one subclass strategy is supported per pattern @@ -585,7 +585,7 @@ function applySubclassStrategies(ast) { const firstElIn = hasWrapperGroup ? firstEl.alternatives[0].elements[0] : firstEl; const singleAltIn = hasWrapperGroup ? firstEl.alternatives[0] : alts[0]; - // ## Subclass strategy `line_or_search_start`: Support leading `(^|\G)` and similar + // ## Strategy `line_or_search_start`: Support leading `(^|\G)` and similar if ( (firstElIn.type === AstTypes.CapturingGroup || firstElIn.type === AstTypes.Group) && firstElIn.alternatives.length === 2 && @@ -608,7 +608,7 @@ function applySubclassStrategies(ast) { } } - // ## Subclass strategy `not_search_start`: Support leading `(?!\G)` and similar + // ## Strategy `not_search_start`: Support leading `(?!\G)` and similar function isNegG(node) { return isLookaround(node) && node.negate && @@ -628,8 +628,8 @@ function applySubclassStrategies(ast) { return {name: 'not_search_start'}; } - // ## Subclass strategy `after_search_start_or_subpattern`: Support leading `(?<=\G|…)` and - // similar. NB: Leading `(?<=\G)` without other alts is already supported; no need for a subclass + // ## Strategy `after_search_start_or_subpattern`: Support leading `(?<=\G|…)` and similar + // Note: Leading `(?<=\G)` without other alts is already supported; no need for a subclass if ( isLookaround(firstElIn) && !firstElIn.negate && @@ -645,22 +645,29 @@ function applySubclassStrategies(ast) { } }); if (hasGAlt && siblingAlts.length) { + let supported = true; if (siblingAlts.some(alt => alt.elements.some(el => { // Check for nodes that are or can include captures return el.type === AstTypes.CapturingGroup || el.type === AstTypes.Group || el.type === AstTypes.Subroutine || isLookaround(el); }))) { - throw new Error(r`Uses "\G" in a way that's unsupported`); + if (emulation === 'loose') { + supported = false; + } else { + throw new Error(r`Uses "\G" in a way that's unsupported`); + } + } + if (supported) { + // [HACK] Replace the lookbehind with an emulation marker since it isn't easy from here to + // acurately extract what will later become the generated subpattern + const emulationGroup = adoptAndSwapKids(createGroup(), [ + adoptAndSwapKids(createAlternative(), [createUnicodeProperty('<<', {skipPropertyNameValidation: true})]), + ...siblingAlts, + adoptAndSwapKids(createAlternative(), [createUnicodeProperty('>>', {skipPropertyNameValidation: true})]), + ]); + emulationGroup.parent = firstElIn.parent; + firstElIn.parent.elements[0] = emulationGroup; + return {name: 'after_search_start_or_subpattern'}; } - // [HACK] Replace the lookbehind with an emulation marker since from here it isn't easy to - // acurately extract what will later become the generated subpattern - const emulationGroup = adoptAndSwapKids(createGroup(), [ - adoptAndSwapKids(createAlternative(), [createUnicodeProperty('<<', {skipPropertyNameValidation: true})]), - ...siblingAlts, - adoptAndSwapKids(createAlternative(), [createUnicodeProperty('>>', {skipPropertyNameValidation: true})]), - ]); - emulationGroup.parent = firstElIn.parent; - firstElIn.parent.elements[0] = emulationGroup; - return {name: 'after_search_start_or_subpattern'}; } } return null; diff --git a/src/utils.js b/src/utils.js index d1468cf..25cc0a1 100644 --- a/src/utils.js +++ b/src/utils.js @@ -1,6 +1,12 @@ const cp = String.fromCodePoint; const r = String.raw; +const EmulationMode = /** @type {const} */ ({ + strict: 'strict', + default: 'default', + loose: 'loose', +}); + const EsVersion = { ES2018: 2018, ES2024: 2024, @@ -45,6 +51,7 @@ function throwIfNot(value, msg) { export { cp, + EmulationMode, EsVersion, getNewCurrentFlags, getOrCreate,