Skip to content

Commit

Permalink
Subclass strategies
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Nov 4, 2024
1 parent e9b3ff4 commit 3f6754a
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 42 deletions.
8 changes: 4 additions & 4 deletions scripts/onig-compare.js
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ compare([
[r`[\O]`, `O`],
[r`\o`, `o`],
[r`[\o]`, `o`],
[r`\o{1}`, `\u{1}`],
[r`[\o{1}]`, `\u{1}`],
[r`\o{1}`, `\u{1}`, `Octal code points not yet supported`],
[r`[\o{1}]`, `\u{1}`, `Octal code points not yet supported`],
[r`\p`, `p`],
[r`[\p]`, `p`],
[r`\p{`, `p{`],
Expand Down Expand Up @@ -69,9 +69,9 @@ compare([
[r`\x{10FFFF}`, `\u{10FFFF}`],
[r`\x{0010FFFF}`, `\u{10FFFF}`], // 8 hex digits
[r`\x{00010FFFF}`, `\u{10FFFF}`], // 9 hex digits
[r`\x{13FFFF}`, ``, r`Beyond Unicode range: JS doesn't support`],
[r`\x{13FFFF}`, ``, `Beyond Unicode range: JS doesn't support`],
[r`\x{140000}`, ``],
[r`\x{0 1}`, `\u{0}\u{1}`],
[r`\x{0 1}`, `\u{0}\u{1}`, `Code point sequences not yet supported`],
[r`\💖`, '💖'],
[`\\\u{10000}`, '\u{10000}'],
]);
Expand Down
13 changes: 12 additions & 1 deletion spec/match-assertion.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ describe('Assertion', () => {
const opts = {allowSubclass: true};

// Leading `(^|\G)` and similar
it('should apply start_of_search_or_line', () => {
it('should apply search_or_line_start', () => {
expect(toRegExp(r`(^|\G)a`, '', opts).exec('b\na')?.index).toBe(2);
// Should match first 3 and last 1
expect('aaabaaacaa\na'.match(toRegExp(
Expand All @@ -163,6 +163,17 @@ describe('Assertion', () => {
// Leading `(?!\G)`
// Checks that a leading `(?!\G)` prevents a match at the position where the search starts, both
// when the lookahead is the very first node and when it is wrapped in a single group
it('should apply not_search_start', () => {
// In 'aba' the `a` at index 0 is at the search start, so it is rejected; the next `a` is at index 2
expect(toRegExp(r`(?!\G)a`, '', opts).exec('aba')?.index).toBe(2);
// Same pattern wrapped in a noncapturing group
expect(toRegExp(r`(?:(?!\G)a)`, '', opts).exec('aba')?.index).toBe(2);
// Same pattern wrapped in a capturing group
expect(toRegExp(r`((?!\G)a)`, '', opts).exec('aba')?.index).toBe(2);
});

// Leading `(?<=\G|…)`
// Checks a leading lookbehind whose first alternative is `\G`: `b` must match either at the search
// start or directly after an `a`
it('should apply after_search_start_or_subpattern', () => {
// `b` at index 0 is at the search start, satisfying the `\G` alternative
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('ba')?.index).toBe(0);
// `b` is not at the search start but follows an `a`, satisfying the second alternative
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('aba')?.index).toBe(1);
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('aaba')?.index).toBe(2);
// The `b`s at indexes 1 and 2 follow `c` and `b` respectively, so the first valid `b` is at index 4
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('cbbab')?.index).toBe(4);
// No `b` is at the search start or preceded by `a`, so there is no match
expect(toRegExp(r`(?<=\G|a)b`, '', opts).exec('cbba')).toBeNull();
});
});
});
Expand Down
34 changes: 22 additions & 12 deletions src/compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,31 +29,41 @@ Transpiles an Oniguruma regex pattern and flags to native JS.
*/
function compile(pattern, flags, options) {
const opts = getOptions(options);
const tokenized = tokenize(pattern, flags);
const onigurumaAst = parse(tokenized, {
optimize: opts.optimize,
});
const regexAst = transform(onigurumaAst, {
const transformOpts = {
allowBestEffort: opts.allowBestEffort,
allowSubclass: opts.allowSubclass,
bestEffortTarget: opts.target,
});
const generated = generate(regexAst, opts);
const result = {
pattern: atomic(possessive(recursion(generated.pattern))),
flags: `${opts.hasIndices ? 'd' : ''}${opts.global ? 'g' : ''}${generated.flags}${generated.options.disable.v ? 'u' : 'v'}`,
};
if (regexAst._strategy) {
const tokenized = tokenize(pattern, flags);
const onigurumaAst = parse(tokenized, {optimize: opts.optimize});
const result = getResultFromOnigurumaAst(onigurumaAst, opts, transformOpts);
if (result._internal) {
result._internal = {
pattern: result.pattern,
strategy: regexAst._strategy,
strategy: result._internal.strategy,
subpattern: (result._internal.subtree ?
getResultFromOnigurumaAst(result._internal.subtree, opts, transformOpts).pattern :
null),
};
// Hide the pattern since it's not accurate unless `toRegExp` constructs it with a subclass
result.pattern = null;
}
return result;
}

/**
Runs the transform and generate steps on an Oniguruma AST and packages the output.

The generated pattern is passed through `recursion`, then `possessive`, then `atomic`. The flags
string is assembled as: optional `d` (from `opts.hasIndices`), optional `g` (from `opts.global`),
the generated flags, and finally `u` if the generator reports flag `v` as disabled, else `v`.
@param {object} onigurumaAst AST passed to `transform`.
@param {object} opts Options passed to `generate`; `hasIndices` and `global` are also read here.
@param {object} transformOpts Options passed to `transform`.
@returns {{pattern: string, flags: string, _internal?: any}} `_internal` is set to the transformed
  AST's `_strategy` value, and only when that value is truthy.
*/
function getResultFromOnigurumaAst(onigurumaAst, opts, transformOpts) {
  const transformed = transform(onigurumaAst, transformOpts);
  const output = generate(transformed, opts);
  const withRecursion = recursion(output.pattern);
  const withPossessive = possessive(withRecursion);
  const finalPattern = atomic(withPossessive);
  const indicesFlag = opts.hasIndices ? 'd' : '';
  const globalFlag = opts.global ? 'g' : '';
  const unicodeFlag = output.options.disable.v ? 'u' : 'v';
  const assembled = {
    pattern: finalPattern,
    flags: `${indicesFlag}${globalFlag}${output.flags}${unicodeFlag}`,
  };
  const strategyData = transformed._strategy;
  // Only expose `_internal` when a subclass strategy was recorded on the transformed AST
  if (!strategyData) {
    return assembled;
  }
  assembled._internal = strategyData;
  return assembled;
}

/**
Returns a complete set of options, with default values set for options that weren't provided.
@param {CompileOptions} [options]
Expand Down
49 changes: 39 additions & 10 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,27 +46,27 @@ Transpiles an Oniguruma regex pattern and flags and returns a native JS RegExp.
function toRegExp(pattern, flags, options) {
const result = compile(pattern, flags, options);
if (result._internal) {
return new WrappedRegExp(result._internal.pattern, result.flags, result._internal.strategy);
return new WrappedRegExp(result._internal.pattern, result.flags, result._internal);
}
return new RegExp(result.pattern, result.flags);
}

class WrappedRegExp extends RegExp {
#strategy;
#data;
/**
@param {string | WrappedRegExp} pattern
@param {string} [flags]
@param {string} [strategy]
@param {string} [data]
*/
constructor(pattern, flags, strategy) {
constructor(pattern, flags, data) {
super(pattern, flags);
if (strategy) {
this.#strategy = strategy;
// The third argument `strategy` isn't provided when regexes are copied as part of the internal
if (data) {
this.#data = data;
// The third argument `data` isn't provided when regexes are copied as part of the internal
// handling of string methods `matchAll` and `split`
} else if (pattern instanceof WrappedRegExp) {
// Can read private properties of the existing object since it was created by this class
this.#strategy = pattern.#strategy;
this.#data = pattern.#data;
}
}
/**
Expand All @@ -76,10 +76,15 @@ class WrappedRegExp extends RegExp {
@returns {RegExpExecArray | null}
*/
exec(str) {
// Special case handling that requires coupling with changes for the specific strategy in the
// transformer. These changes add emulation support for some common patterns that are otherwise
// unsupportable. Only one subclass strategy is supported per pattern
const useLastIndex = this.global || this.sticky;
const pos = this.lastIndex;
const exec = RegExp.prototype.exec;
if (this.#strategy === 'start_of_search_or_line' && useLastIndex && this.lastIndex) {
// Support leading `(^|\G)` and similar
if (this.#data.strategy === 'search_or_line_start' && useLastIndex && this.lastIndex) {
// Reset since testing on a sliced string that we want to match at the start of
this.lastIndex = 0;
const match = exec.call(this, str.slice(pos));
if (match) {
Expand All @@ -89,7 +94,8 @@ class WrappedRegExp extends RegExp {
}
return match;
}
if (this.#strategy === 'not_search_start') {
// Support leading `(?!\G)`
if (this.#data.strategy === 'not_search_start') {
let match = exec.call(this, str);
if (match?.index === pos) {
match = exec.call(this, str.slice(1));
Expand All @@ -101,6 +107,29 @@ class WrappedRegExp extends RegExp {
}
return match;
}
// Support leading `(?<=\G|…)`
// Note: Leading `(?<=\G)` without other alts is supported without the need for a subclass
if (this.#data.strategy === 'after_search_start_or_subpattern') {
let match = exec.call(this, str);
if (!match) {
return match;
}
if (match.index === pos) {
// Satisfied `\G` in lookbehind
return match;
}
let globalRe = useLastIndex ? this : new RegExp(this, `g${this.flags}`);
const reBehind = new RegExp(`(?:${this.#data.subpattern})$`);
while (match) {
if (reBehind.exec(str.slice(0, match.index))) {
// Satisfied other alternative in lookbehind; return the main pattern's match
return match;
}
globalRe.lastIndex = match.index + 1;
match = exec.call(globalRe, str);
}
return match;
}
return exec.call(this, str);
}
}
Expand Down
61 changes: 46 additions & 15 deletions src/transform.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import emojiRegex from 'emoji-regex-xs';
import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createBackreference, createGroup, createLookaround, createUnicodeProperty, isLookaround, parse} from './parse.js';
import {AstAssertionKinds, AstCharacterSetKinds, AstDirectiveKinds, AstTypes, AstVariableLengthCharacterSetKinds, createAlternative, createBackreference, createFlags, createGroup, createLookaround, createPattern, createRegex, createUnicodeProperty, isLookaround, parse} from './parse.js';
import {tokenize} from './tokenize.js';
import {traverse} from './traverse.js';
import {JsUnicodeProperties, PosixClassesMap} from './unicode.js';
Expand Down Expand Up @@ -543,8 +543,9 @@ function adoptAndSwapKids(parent, kids) {
}

function applySubclassStrategies(ast) {
// Special case handling for common patterns that are otherwise unsupportable; only one subclass
// strategy supported per pattern; see `WrappedRegExp` in `index.js`
// Special case handling that requires coupling with a `RegExp` subclass (see `WrappedRegExp` in
// `index.js`). These changes add emulation support for some common patterns that are otherwise
// unsupportable. Only one subclass strategy is supported per pattern
const alts = ast.pattern.alternatives;
const first = alts[0].elements[0];
if (alts.length !== 1 || !first) {
Expand All @@ -554,7 +555,7 @@ function applySubclassStrategies(ast) {
(first.type === AstTypes.CapturingGroup || first.type === AstTypes.Group) &&
first.alternatives.length === 1;
const firstIn = hasWrappingGroup ? first.alternatives[0].elements[0] : first;
// Strategy `start_of_search_or_line` adds support for leading `(^|\G)` and similar
// Subclass strategy `search_or_line_start`: Support leading `(^|\G)` and similar
if (
(firstIn.type === AstTypes.CapturingGroup || firstIn.type === AstTypes.Group) &&
firstIn.alternatives.length === 2 &&
Expand All @@ -573,20 +574,50 @@ function applySubclassStrategies(ast) {
} else {
firstIn.alternatives.shift();
}
return 'start_of_search_or_line';
return {
strategy: 'search_or_line_start',
};
}
}
// Strategy `not_search_start` adds support for leading `(?!\G)`
// Subclass strategy `not_search_start`: Support leading `(?!\G)`
if (
isLookaround(first) &&
first.negate &&
first.alternatives.length === 1 &&
first.alternatives[0].elements.length === 1 &&
first.alternatives[0].elements[0].kind === AstAssertionKinds.search_start
isLookaround(firstIn) &&
firstIn.negate &&
firstIn.alternatives.length === 1 &&
firstIn.alternatives[0].elements.length === 1 &&
firstIn.alternatives[0].elements[0].kind === AstAssertionKinds.search_start
) {
// Remove the negative lookahead
alts[0].elements.shift();
return 'not_search_start';
// Remove the lookahead
firstIn.parent.elements.shift();
return {
strategy: 'not_search_start',
};
}
// Subclass strategy `after_search_start_or_subpattern`: Support leading `(?<=\G|…)`
// Note: Leading `(?<=\G)` without other alts is supported without the need for a subclass
if (
isLookaround(firstIn) &&
!firstIn.negate &&
firstIn.alternatives.length > 1 &&
firstIn.alternatives[0].elements.length === 1 &&
firstIn.alternatives[0].elements[0].kind === AstAssertionKinds.search_start
) {
const siblingAlts = firstIn.alternatives.slice(1);
if (siblingAlts.some(alt => alt.elements.some(el => {
// Don't remove capturing groups from the tree because they can affect other nodes; keeping
// it simple to avoid recursively checking other group types
return isLookaround(el) || el.type === AstTypes.CapturingGroup || el.type === AstTypes.Group;
}))) {
throw new Error(r`Uses "\G" in a way that's unsupported for conversion to JS`);
}
const pattern = adoptAndSwapKids(createPattern(), siblingAlts);
const root = createRegex(pattern, createFlags(ast.flags));
// Remove the lookbehind
firstIn.parent.elements.shift();
return {
subtree: root,
strategy: 'after_search_start_or_subpattern',
};
}
return null;
}
Expand Down Expand Up @@ -763,7 +794,7 @@ function parseFragment(pattern, {bypassPropertyNameCheck} = {}) {
const ast = parse(tokenize(pattern), {bypassPropertyNameCheck});
const alts = ast.pattern.alternatives;
if (alts.length > 1 || alts[0].elements.length > 1) {
return adoptAndSwapKids(createGroup(), alts);;
return adoptAndSwapKids(createGroup(), alts);
}
return alts[0].elements[0];
}
Expand Down

0 comments on commit 3f6754a

Please sign in to comment.