From 4f116aa14a45df3f9cc0062645ce601323558db0 Mon Sep 17 00:00:00 2001
From: Steven Levithan
Date: Thu, 19 Dec 2024 03:32:41 +0100
Subject: [PATCH] Add rules.captureName option
---
README.md | 42 ++++++++---
demo/demo.css | 6 +-
demo/demo.js | 1 +
demo/index.html | 121 +++++++++++++++++--------------
scripts/utils.js | 4 +-
spec/helpers/matchers.js | 5 +-
spec/options.spec.js | 150 +++++++++++++++++++++++++++++++++++++++
spec/todetails.spec.js | 36 +---------
src/index.js | 10 ++-
src/options.js | 5 +-
src/parse.js | 4 +-
src/tokenize.js | 56 +++++++++------
src/transform.js | 39 ++++++----
13 files changed, 334 insertions(+), 145 deletions(-)
create mode 100644 spec/options.spec.js
diff --git a/README.md b/README.md
index 16e6204..7c87875 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,7 @@ type OnigurumaToEsOptions = {
allowOrphanBackrefs?: boolean;
allowUnhandledGAnchors?: boolean;
asciiWordBoundaries?: boolean;
+ captureGroup?: boolean;
};
target?: 'auto' | 'ES2025' | 'ES2024' | 'ES2018';
verbose?: boolean;
@@ -117,6 +118,9 @@ function toOnigurumaAst(
pattern: string,
options?: {
flags?: string;
+ rules?: {
+ captureGroup?: boolean;
+ };
}
): OnigurumaAst;
```
@@ -210,7 +214,8 @@ Advanced pattern options that override standard error checking and flags when en
- `allowOrphanBackrefs`: Useful with TextMate grammars that merge backreferences across patterns.
- `allowUnhandledGAnchors`: Applies flag `y` for unsupported uses of `\G`, rather than erroring.
- Oniguruma-To-ES uses a variety of strategies to accurately emulate many common uses of `\G`. When using this option, if a `\G` is found that doesn't have a known emulation strategy, the `\G` is simply removed and JavaScript's `y` (`sticky`) flag is added. This might lead to some false positives and negatives, but is useful for non-critical matching (like syntax highlighting) when having some mismatches is better than not working.
-- `asciiWordBoundaries`: Use ASCII-based `\b` and `\B`, which increases performance.
+- `asciiWordBoundaries`: Use ASCII-based `\b` and `\B`, which increases search performance of generated regexes.
+- `captureGroup`: Oniguruma option `ONIG_OPTION_CAPTURE_GROUP`. Allows unnamed captures, numbered backreferences, and numbered subroutine calls in patterns that also use named capture.
### `target`
@@ -616,7 +621,7 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ Always "multiline"
✔ Only \n as newline
- ✔ No match after string-terminating \n
+ ✔ ^ doesn't match after string-terminating \n
@@ -911,6 +916,17 @@ Notice that nearly every feature below has at least subtle differences from Java
✔ Error
+
+
+
Compile-time options
+
ONIG_OPTION_CAPTURE_GROUP
+
✅
+
✅
+
+ ✔ Unnamed captures and numbered calls allowed when using named capture
+ ✔ Allows numbered subroutine refs to duplicate group names
+
+
The table above doesn't include all aspects that Oniguruma-To-ES emulates (including error handling, most aspects that work the same as in JavaScript, and many aspects of non-JavaScript features that work the same in the other regex flavors that support them).
@@ -928,14 +944,20 @@ The table above doesn't include all aspects that Oniguruma-To-ES emulates (inclu
The following don't yet have any support, and throw errors. They're all infrequently-used features, with most being *extremely* rare.
-- Grapheme boundaries: `\y`, `\Y`.
-- Flags `P` (POSIX is ASCII) and `y{g}`/`y{w}` (grapheme boundary modes).
-- Whole-pattern modifiers: Don't capture `(?C)`, ignore-case is ASCII `(?I)`, find longest `(?L)`.
-- Absence functions: `(?~…)`, etc.
-- Conditionals: `(?(…)…)`, etc.
-- Rarely-used character specifiers: Non-A-Za-z with `\cx`, `\C-x`; meta `\M-x`, `\M-\C-x`; bracketed octals `\o{…}`; octal UTF-8 encoded bytes (≥ `\200`).
-- Code point sequences: `\x{H H …}`, `\o{O O …}`.
-- Callout functions: `(?{…})`, etc.
+- Supportable:
+ - Grapheme boundaries: `\y`, `\Y`.
+ - Flags `P` (POSIX is ASCII) and `y{g}`/`y{w}` (grapheme boundary modes).
+ - Rarely-used character specifiers: Non-A-Za-z with `\cx`, `\C-x`; meta `\M-x`, `\M-\C-x`; bracketed octals `\o{…}`; octal UTF-8 encoded bytes (≥ `\200`).
+ - Code point sequences: `\x{H H …}`, `\o{O O …}`.
+ - Whole-pattern modifiers: Don't capture `(?C)`, ignore-case is ASCII `(?I)`.
+- Supportable for some uses:
+ - Absence functions: `(?~…)`, etc.
+ - Conditionals: `(?(…)…)`, etc.
+ - Whole-pattern modifiers: Find longest `(?L)`.
+- Not supportable:
+ - Callout functions: `(?{…})`, etc.
+
+Despite the current omissions, Oniguruma-To-ES handles more than 99.9% of real-world Oniguruma regexes, based on patterns used in a large [collection](https://github.com/shikijs/textmate-grammars-themes/tree/main/packages/tm-grammars/grammars) of TextMate grammars.
## ㊗️ Unicode / mixed case-sensitivity
diff --git a/demo/demo.css b/demo/demo.css
index 821b95c..c070394 100644
--- a/demo/demo.css
+++ b/demo/demo.css
@@ -162,12 +162,12 @@ pre, code, kbd, textarea {
border-radius: 0.375em;
}
-#more-options {
+#more-options-cols {
display: flex;
}
-#more-options div {
- margin-right: 3%;
+#more-options-cols div {
+ margin-right: 5%;
}
#output, textarea {
diff --git a/demo/demo.js b/demo/demo.js
index eea024a..828b0e2 100644
--- a/demo/demo.js
+++ b/demo/demo.js
@@ -24,6 +24,7 @@ const state = {
allowOrphanBackrefs: getValue('option-allowOrphanBackrefs'),
allowUnhandledGAnchors: getValue('option-allowUnhandledGAnchors'),
asciiWordBoundaries: getValue('option-asciiWordBoundaries'),
+ captureGroup: getValue('option-captureGroup'),
},
target: getValue('option-target'),
verbose: getValue('option-verbose'),
diff --git a/demo/index.html b/demo/index.html
index 00f486c..48f19e2 100644
--- a/demo/index.html
+++ b/demo/index.html
@@ -74,31 +74,74 @@
Try it
More options
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/scripts/utils.js b/scripts/utils.js
index 3b4c3a4..6e99b0f 100644
--- a/scripts/utils.js
+++ b/scripts/utils.js
@@ -76,7 +76,9 @@ function getMatchDetails(match) {
const transpiledRegExpResult = (pattern, str, pos) => {
let result;
try {
- const options = {};
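+ // `vscode-oniguruma` uses option `ONIG_OPTION_CAPTURE_GROUP` by default, so enable the
+ // equivalent `rules.captureGroup` (reference link missing here; see the vscode-oniguruma source)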
+ const options = {rules: {captureGroup: true}};
if (pos) {
options.global = true;
}
diff --git a/spec/helpers/matchers.js b/spec/helpers/matchers.js
index f8b876a..f0f3303 100644
--- a/spec/helpers/matchers.js
+++ b/spec/helpers/matchers.js
@@ -12,6 +12,7 @@ function getArgs(actual, expected) {
pattern: typeof expected === 'string' ? expected : expected.pattern,
flags: expected.flags ?? '',
accuracy: expected.accuracy ?? 'default',
+ rules: expected.rules ?? {},
strings: Array.isArray(actual) ? actual : [actual],
targets: targeted,
};
@@ -24,9 +25,9 @@ function wasFullStrMatch(match, str) {
// Expects `negate` to be set by `negativeCompare` and doesn't rely on Jasmine's automatic matcher
// negation because when negated we don't want to early return `true` when looping over the array
// of strings and one is found to not match; they all need to not match
-function matchWithAllTargets({pattern, flags, strings, targets, accuracy}, {exact, negate}) {
+function matchWithAllTargets({pattern, flags, accuracy, rules, strings, targets}, {exact, negate}) {
for (const target of targets) {
- const re = toRegExp(pattern, {accuracy, flags, target});
+ const re = toRegExp(pattern, {accuracy, flags, rules, target});
for (const str of strings) {
// In case the regex includes flag g or y
re.lastIndex = 0;
diff --git a/spec/options.spec.js b/spec/options.spec.js
new file mode 100644
index 0000000..9bf6d62
--- /dev/null
+++ b/spec/options.spec.js
@@ -0,0 +1,150 @@
+import {toDetails} from '../dist/index.mjs';
+import {envSupportsFlagV, r} from '../src/utils.js';
+import {matchers} from './helpers/matchers.js';
+
+beforeEach(() => {
+ jasmine.addMatchers(matchers);
+});
+
+describe('Options', () => {
+ describe('flags', () => {
+ it('should accept and translate supported flags', () => {
+ expect(toDetails('', {flags: 'i'}).flags).toContain('i');
+ expect(toDetails('', {flags: 'm'}).flags).toContain('s');
+ expect(toDetails('', {flags: 'm'}).flags).not.toContain('m');
+ expect(toDetails('', {flags: 'x'}).flags).not.toContain('x');
+ expect(toDetails('', {flags: 'D'}).flags).not.toContain('D');
+ expect(toDetails('', {flags: 'S'}).flags).not.toContain('S');
+ expect(toDetails('', {flags: 'W'}).flags).not.toContain('W');
+ });
+
+ it('should throw for unexpected flags', () => {
+ expect(() => toDetails('', {flags: 'd'})).toThrow();
+ expect(() => toDetails('', {flags: 'g'})).toThrow();
+ expect(() => toDetails('', {flags: 's'})).toThrow();
+ expect(() => toDetails('', {flags: 'u'})).toThrow();
+ expect(() => toDetails('', {flags: 'v'})).toThrow();
+ expect(() => toDetails('', {flags: 'y'})).toThrow();
+ });
+ });
+
+ describe('target', () => {
+ it('should set target based on env for target auto', () => {
+ if (envSupportsFlagV) {
+ expect(toDetails('', {target: 'auto'}).flags).toBe('v');
+ } else {
+ expect(toDetails('', {target: 'auto'}).flags).toBe('u');
+ }
+ });
+
+ it('should use target auto if unspecified', () => {
+ if (envSupportsFlagV) {
+ expect(toDetails('').flags).toBe('v');
+ } else {
+ expect(toDetails('').flags).toBe('u');
+ }
+ });
+
+ it('should add flag v for target ES2024+', () => {
+ expect(toDetails('', {target: 'ES2024'}).flags).toBe('v');
+ expect(toDetails('', {target: 'ES2025'}).flags).toBe('v');
+ });
+
+ it('should add flag u for target ES2018', () => {
+ expect(toDetails('', {target: 'ES2018'}).flags).toBe('u');
+ });
+
+ it('should throw for unexpected targets', () => {
+ expect(() => toDetails('', {target: 'ES6'})).toThrow();
+ expect(() => toDetails('', {target: 'ES2019'})).toThrow();
+ });
+ });
+
+ describe('rules', () => {
+ describe('captureGroup', () => {
+ it('enables mixed unnamed and named capture', () => {
+ expect('aba').toExactlyMatch({
+ pattern: r`(a)(?b)\1`,
+ rules: {captureGroup: true},
+ });
+ expect('abb').toExactlyMatch({
+ pattern: r`(a)(?b)\2`,
+ rules: {captureGroup: true},
+ });
+ // Without `rules.captureGroup`
+ expect(() => toDetails(r`(a)(?b)\1`)).toThrow();
+ });
+
+ it('no multiplexing for numbered backrefs to named capture', () => {
+ expect('abb').toExactlyMatch({
+ pattern: r`(?a)(?b)\2`,
+ rules: {captureGroup: true},
+ });
+ expect('aba').not.toFindMatch({
+ pattern: r`(?a)(?b)\2`,
+ rules: {captureGroup: true},
+ });
+ });
+
+ it('multiplexing preserved for named backrefs', () => {
+ expect(['abcb', 'abcc']).toExactlyMatch({
+ pattern: r`(a)(?b)(?c)\k`,
+ rules: {captureGroup: true},
+ });
+ expect('abca').not.toFindMatch({
+ pattern: r`(a)(?b)(?c)\k`,
+ rules: {captureGroup: true},
+ });
+ });
+
+ it('backrefs rematch the most recent of a set with subroutines and unnamed capture', () => {
+ expect('abcc').toExactlyMatch({
+ pattern: r`(.)(?b)\g<1>\1`,
+ rules: {captureGroup: true},
+ });
+ expect('abca').not.toFindMatch({
+ pattern: r`(.)(?b)\g<1>\1`,
+ rules: {captureGroup: true},
+ });
+ });
+
+ it('backrefs rematch the most recent of a set with subroutines and named capture', () => {
+ expect('abcc').toExactlyMatch({
+ pattern: r`(a)(?.)\g<2>\2`,
+ rules: {captureGroup: true},
+ });
+ expect('abcb').not.toFindMatch({
+ pattern: r`(a)(?.)\g<2>\2`,
+ rules: {captureGroup: true},
+ });
+ expect('abcb').not.toFindMatch({
+ pattern: r`(a)(?.)\g<2>\k`,
+ rules: {captureGroup: true},
+ });
+ });
+
+ it('allows numbered subroutine refs to duplicate group names', () => {
+ expect(['abca', 'abcc']).toExactlyMatch({
+ pattern: r`(?.)(?.)\g<2>\k`,
+ rules: {captureGroup: true},
+ });
+ expect('abcb').not.toFindMatch({
+ pattern: r`(?.)(?.)\g<2>\k`,
+ rules: {captureGroup: true},
+ });
+ expect(['abcdc', 'abcdd']).toExactlyMatch({
+ pattern: r`(a)(?.)(?.)\g<2>\k`,
+ rules: {captureGroup: true},
+ });
+ expect('abcdb').not.toFindMatch({
+ pattern: r`(a)(?.)(?.)\g<2>\k`,
+ rules: {captureGroup: true},
+ });
+ });
+ });
+
+ // TODO: Add specs for the remaining `rules` options
+ });
+
+ // TODO: Add specs for the remaining top-level options
+});
diff --git a/spec/todetails.spec.js b/spec/todetails.spec.js
index fc9d0e6..521bdd9 100644
--- a/spec/todetails.spec.js
+++ b/spec/todetails.spec.js
@@ -5,7 +5,7 @@ describe('toDetails', () => {
expect(Object.keys(toDetails(''))).toEqual(['pattern', 'flags']);
});
- it('should throw for non-string pattern', () => {
+ it('should throw for non-string patterns', () => {
expect(() => toDetails()).toThrow();
for (const value of [undefined, null, 0, false, [], {}, /(?:)/]) {
expect(() => toDetails(value)).toThrow();
@@ -15,38 +15,4 @@ describe('toDetails', () => {
it('should return an empty pattern if given an empty string', () => {
expect(toDetails('').pattern).toBe('');
});
-
- it('should accept and translate supported flags', () => {
- expect(toDetails('', {flags: 'i'}).flags).toContain('i');
- expect(toDetails('', {flags: 'm'}).flags).toContain('s');
- expect(toDetails('', {flags: 'm'}).flags).not.toContain('m');
- expect(toDetails('', {flags: 'x'}).flags).not.toContain('x');
- });
-
- it('should throw for unexpected flags', () => {
- expect(() => toDetails('', {flags: 'd'})).toThrow();
- expect(() => toDetails('', {flags: 'g'})).toThrow();
- expect(() => toDetails('', {flags: 's'})).toThrow();
- expect(() => toDetails('', {flags: 'u'})).toThrow();
- expect(() => toDetails('', {flags: 'v'})).toThrow();
- expect(() => toDetails('', {flags: 'y'})).toThrow();
- });
-
- it('should add flag v if target unspecified', () => {
- expect(toDetails('').flags).toBe('v');
- });
-
- it('should add flag v for target ES2024+', () => {
- expect(toDetails('', {target: 'ES2024'}).flags).toBe('v');
- expect(toDetails('', {target: 'ES2025'}).flags).toBe('v');
- });
-
- it('should add flag u for target ES2018', () => {
- expect(toDetails('', {target: 'ES2018'}).flags).toBe('u');
- });
-
- it('should throw for unexpected targets', () => {
- expect(() => toDetails('', {target: 'ES6'})).toThrow();
- expect(() => toDetails('', {target: 'ES2019'})).toThrow();
- });
});
diff --git a/src/index.js b/src/index.js
index 0be7a98..768a7e8 100644
--- a/src/index.js
+++ b/src/index.js
@@ -32,6 +32,7 @@ import {recursion} from 'regex-recursion';
allowOrphanBackrefs?: boolean;
allowUnhandledGAnchors?: boolean;
asciiWordBoundaries?: boolean;
+ captureGroup?: boolean;
};
target?: keyof Target;
verbose?: boolean;
@@ -50,7 +51,7 @@ Accepts an Oniguruma pattern and returns the details needed to construct an equi
*/
function toDetails(pattern, options) {
const opts = getOptions(options);
- const tokenized = tokenize(pattern, opts.flags);
+ const tokenized = tokenize(pattern, opts.flags, {captureGroup: opts.rules.captureGroup});
const onigurumaAst = parse(tokenized, {
skipBackrefValidation: opts.rules.allowOrphanBackrefs,
verbose: opts.verbose,
@@ -85,11 +86,16 @@ Returns an Oniguruma AST generated from an Oniguruma pattern.
@param {string} pattern Oniguruma regex pattern.
@param {{
flags?: string;
+ rules?: {
+ captureGroup?: boolean;
+ };
}} [options]
@returns {import('./parse.js').OnigurumaAst}
*/
function toOnigurumaAst(pattern, options) {
- return parse(tokenize(pattern, options?.flags));
+ const flags = options?.flags ?? '';
+ const captureGroup = options?.rules?.captureGroup ?? false;
+ return parse(tokenize(pattern, flags, {captureGroup}));
}
/**
diff --git a/src/options.js b/src/options.js
index 51dd32b..429d739 100644
--- a/src/options.js
+++ b/src/options.js
@@ -57,8 +57,11 @@ function getOptions(options) {
allowOrphanBackrefs: false,
// Applies flag `y` for unsupported uses of `\G`, rather than erroring.
allowUnhandledGAnchors: false,
- // Use ASCII-based `\b` and `\B`, which increases performance.
+ // Use ASCII-based `\b` and `\B`, which increases search performance of generated regexes.
asciiWordBoundaries: false,
+ // Oniguruma option `ONIG_OPTION_CAPTURE_GROUP`. Unnamed captures and numbered calls allowed
+ // when using named capture.
+ captureGroup: false,
...(options?.rules),
},
};
diff --git a/src/parse.js b/src/parse.js
index aaf1ffa..c1c47ed 100644
--- a/src/parse.js
+++ b/src/parse.js
@@ -64,7 +64,7 @@ const AstVariableLengthCharacterSetKinds = {
}} [options]
@returns {OnigurumaAst}
*/
-function parse({tokens, flags}, options) {
+function parse({tokens, flags, rules}, options) {
const opts = {
skipBackrefValidation: false,
skipPropertyNameValidation: false,
@@ -135,7 +135,7 @@ function parse({tokens, flags}, options) {
// `context` updated by preceding `walk` loop
const {capturingGroups, hasNumberedRef, namedGroupsByName, subroutines} = context;
// Validation that requires knowledge about the complete pattern
- if (hasNumberedRef && namedGroupsByName.size) {
+ if (hasNumberedRef && namedGroupsByName.size && !rules.captureGroup) {
throw new Error('Numbered backref/subroutine not allowed when using named capture');
}
for (const {ref} of subroutines) {
diff --git a/src/tokenize.js b/src/tokenize.js
index 5d67bb1..a06b413 100644
--- a/src/tokenize.js
+++ b/src/tokenize.js
@@ -125,22 +125,33 @@ const charClassTokenRe = new RegExp(r`
extended: boolean;
ignoreCase: boolean;
};
+ rules: {
+ captureGroup: boolean;
+ };
}} TokenizerResult
*/
/**
-@param {string} pattern
-@param {string} [flags] Oniguruma flags. Flag `m` is equivalent to JS flag `s`.
+@param {string} pattern Oniguruma pattern.
+@param {string} [flags] Oniguruma flags.
+@param {{captureGroup?: boolean;}} [rules] Oniguruma compile-time options.
@returns {TokenizerResult}
*/
-function tokenize(pattern, flags = '') {
+function tokenize(pattern, flags = '', rules) {
+ rules = {
+ // `ONIG_OPTION_CAPTURE_GROUP`
+ captureGroup: false,
+ ...rules,
+ };
if (typeof pattern !== 'string') {
throw new Error('String expected as pattern');
}
if (!/^[imxDSW]*$/.test(flags)) {
throw new Error(`Flags "${flags}" includes unsupported value`);
}
- const xStack = [flags.includes('x')];
+ const extended = flags.includes('x');
+ const xStack = [extended];
const context = {
+ captureGroup: rules.captureGroup,
getCurrentModX: () => xStack.at(-1),
numOpenGroups: 0,
popModX() {xStack.pop()},
@@ -163,25 +174,24 @@ function tokenize(pattern, flags = '') {
}
const potentialUnnamedCaptureTokens = [];
- let numNamedCaptures = 0;
+ let numNamedAndOptInUnnamedCaptures = 0;
tokens.forEach(t => {
if (t.type === TokenTypes.GroupOpen) {
if (t.kind === TokenGroupKinds.capturing) {
- numNamedCaptures++;
- t.number = numNamedCaptures;
+ t.number = ++numNamedAndOptInUnnamedCaptures;
} else if (t.raw === '(') {
potentialUnnamedCaptureTokens.push(t);
}
}
});
- // Enable unnamed capturing groups if no named captures
- if (!numNamedCaptures) {
+ // Enable unnamed capturing groups if no named captures (when `captureGroup` not enabled)
+ if (!numNamedAndOptInUnnamedCaptures) {
potentialUnnamedCaptureTokens.forEach((t, i) => {
t.kind = TokenGroupKinds.capturing;
t.number = i + 1;
});
}
- const numCaptures = numNamedCaptures || potentialUnnamedCaptureTokens.length;
+ const numCaptures = numNamedAndOptInUnnamedCaptures || potentialUnnamedCaptureTokens.length;
// Can now split escaped nums accurately, accounting for number of captures
tokens = tokens.map(
t => t.type === TokenTypes.EscapedNumber ? splitEscapedNumToken(t, numCaptures) : t
@@ -195,12 +205,13 @@ function tokenize(pattern, flags = '') {
// is equivalent to JS flag s
dotAll: flags.includes('m'),
// Flag x is fully handled during tokenization
- extended: flags.includes('x'),
+ extended,
// Flags D, S, W are currently only supported as top-level flags
digitIsAscii: flags.includes('D'),
spaceIsAscii: flags.includes('S'),
wordIsAscii: flags.includes('W'),
},
+ rules,
};
}
@@ -296,8 +307,9 @@ function getTokenWithDetails(context, pattern, m, lastIndex) {
context.pushModX(context.getCurrentModX());
context.numOpenGroups++;
if (
- // Unnamed capture if no named captures, else noncapturing group
- m === '(' ||
+ // Plain `(` without `captureGroup`: tokenized as a noncapturing group for now; a second pass
+ // makes it an unnamed capture if the pattern has no named captures
+ (m === '(' && !context.captureGroup) ||
// Noncapturing group
m === '(?:'
) {
@@ -325,14 +337,18 @@ function getTokenWithDetails(context, pattern, m, lastIndex) {
}),
};
}
- // Named capture (checked after lookbehind due to similar syntax)
- if (m2 === '<' || m2 === "'") {
+ // Named capture (checked after lookbehind due to similar syntax), or unnamed capture when
+ // `captureGroup` enabled
+ if (m2 === '<' || m2 === "'" || (m === '(' && context.captureGroup)) {
+ const token = createToken(TokenTypes.GroupOpen, m, {
+ kind: TokenGroupKinds.capturing,
+ // Will add `number` in a second pass
+ });
+ if (m !== '(') {
+ token.name = m.slice(3, -1);
+ }
return {
- token: createToken(TokenTypes.GroupOpen, m, {
- kind: TokenGroupKinds.capturing,
- name: m.slice(3, -1),
- // Will add `number` in a second pass
- }),
+ token,
}
}
if (m2 === '(') {
diff --git a/src/transform.js b/src/transform.js
index 72b0e23..ffeea14 100644
--- a/src/transform.js
+++ b/src/transform.js
@@ -161,7 +161,10 @@ const FirstPassVisitor = {
if (name && !isValidGroupNameJs(name)) {
throw new Error(`Group name "${name}" invalid in JS`);
}
- subroutineRefMap.set(name ?? number, node);
+ subroutineRefMap.set(number, node);
+ if (name) {
+ subroutineRefMap.set(name, node);
+ }
},
CharacterSet({node, replaceWith}, {accuracy, minTargetEs2024, digitIsAscii, spaceIsAscii, wordIsAscii}) {
@@ -361,7 +364,7 @@ const SecondPassVisitor = {
Backreference({node}, {multiplexCapturesToLeftByRef, reffedNodesByReferencer}) {
const {orphan, ref} = node;
if (!orphan) {
- // Copy the current state for later multiplexing expansion. It's done in a subsequent pass
+ // Copy the current state for later multiplexing expansion. That's done in a subsequent pass
// because backref numbers need to be recalculated after subroutine expansion
reffedNodesByReferencer.set(node, [...multiplexCapturesToLeftByRef.get(ref).map(({node}) => node)]);
}
@@ -399,25 +402,27 @@ const SecondPassVisitor = {
) {
// Has value if we're within a subroutine expansion
const origin = groupOriginByCopy.get(node);
- const ref = node.name ?? node.number;
// ## Handle recursion; runs after subroutine expansion
- if (origin && openRefs.has(ref)) {
+ if (origin && openRefs.has(node.number)) {
// Recursion doesn't affect any following backrefs to its `ref` (unlike other subroutines),
// so don't wrap with a capture. The reffed group might have its name removed due to later
// subroutine expansion
- const recursion = createRecursion(ref);
- reffedNodesByReferencer.set(recursion, openRefs.get(ref));
+ const recursion = createRecursion(node.number);
+ reffedNodesByReferencer.set(recursion, openRefs.get(node.number));
replaceWith(recursion);
// This node's kids have been removed from the tree, so no need to traverse them
skip();
return;
}
- // Name or number; not mixed since can't use numbered subroutines with named capture
- openRefs.set(ref, node);
+ openRefs.set(node.number, node);
// ## Track data for backref multiplexing
- const multiplexNodes = getOrCreate(multiplexCapturesToLeftByRef, ref, []);
+ multiplexCapturesToLeftByRef.set(node.number, []);
+ if (node.name) {
+ getOrCreate(multiplexCapturesToLeftByRef, node.name, []);
+ }
+ const multiplexNodes = multiplexCapturesToLeftByRef.get(node.name ?? node.number);
for (let i = 0; i < multiplexNodes.length; i++) {
// Captures added via subroutine expansion (maybe indirectly because they were descendant
// captures of the reffed group or in a nested subroutine expansion) form a set with their
@@ -438,7 +443,10 @@ const SecondPassVisitor = {
break;
}
}
- multiplexNodes.push({node, origin});
+ multiplexCapturesToLeftByRef.get(node.number).push({node, origin});
+ if (node.name) {
+ multiplexCapturesToLeftByRef.get(node.name).push({node, origin});
+ }
// ## Track data for duplicate names within an alternation path
// Pre-ES2025 doesn't allow duplicate names, but ES2025+ allows duplicate names that are
@@ -459,7 +467,7 @@ const SecondPassVisitor = {
}
},
exit({node}, {openRefs}) {
- openRefs.delete(node.name ?? node.number);
+ openRefs.delete(node.number);
},
},
@@ -483,7 +491,7 @@ const SecondPassVisitor = {
// Other forms of recursion are handled by the `CapturingGroup` visitor
const isGlobalRecursion = ref === 0;
const expandedSubroutine = isGlobalRecursion ?
- createRecursion(ref) :
+ createRecursion(0) :
// The reffed group might itself contain subroutines, which are expanded during sub-traversal
cloneCapturingGroup(reffedGroupNode, state.groupOriginByCopy, null);
let replacement = expandedSubroutine;
@@ -790,8 +798,11 @@ function isValidGroupNameJs(name) {
// Returns a single node, either the given node or all nodes wrapped in a noncapturing group
function parseFragment(pattern, options) {
- const skipPropertyNameValidation = !!options?.skipPropertyNameValidation;
- const ast = parse(tokenize(pattern), {skipPropertyNameValidation});
+ const opts = {
+ skipPropertyNameValidation: false,
+ ...options,
+ };
+ const ast = parse(tokenize(pattern), opts);
const alts = ast.pattern.alternatives;
if (alts.length > 1 || alts[0].elements.length > 1) {
return adoptAndSwapKids(createGroup(), alts);