From 8e5d94cc670b265f033aaecc5c593c0ca6e698b6 Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Sun, 3 Nov 2024 11:27:43 +0100 Subject: [PATCH] Fix edge case for literal hyphen --- README.md | 3 +- spec/helpers/features.js | 11 ++-- spec/helpers/matchers.js | 7 +-- spec/match-backreference.spec.js | 20 +++---- spec/match-char-class-intersection.spec.js | 1 + spec/match-char-class-range.spec.js | 62 ++++++++++++++++++++-- spec/match-char-class.spec.js | 1 + spec/match-char-set.spec.js | 4 +- src/parse.js | 4 +- src/tokenize.js | 2 +- 10 files changed, 89 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index d634935..6c0c9b1 100644 --- a/README.md +++ b/README.md @@ -488,8 +488,9 @@ Notice that nearly every feature below has at least subtle differences from Java ✅ ✅ - ✔ Unescaped - is literal char in some contexts (different than JS rules in any mode)
+ ✔ Unescaped - outside of range is literal in some contexts (different than JS rules in any mode)
✔ Fewer chars require escaping than JS
+ ✔ Error for reversed range (same as JS)
diff --git a/spec/helpers/features.js b/spec/helpers/features.js index 3f8390e..29dd4e7 100644 --- a/spec/helpers/features.js +++ b/spec/helpers/features.js @@ -6,7 +6,7 @@ const duplicateCaptureNamesSupported = (() => { } return true; })(); -const maxTargetForDuplicateNames = duplicateCaptureNamesSupported ? null : 'ES2024'; +const maxTestTargetForDuplicateNames = duplicateCaptureNamesSupported ? null : 'ES2024'; const patternModsSupported = (() => { try { @@ -16,9 +16,12 @@ const patternModsSupported = (() => { } return true; })(); -const maxTargetForPatternMods = patternModsSupported ? null : 'ES2024'; +const maxTestTargetForPatternMods = patternModsSupported ? null : 'ES2024'; + +const minTestTargetForFlagV = 'ES2024'; export { - maxTargetForDuplicateNames, - maxTargetForPatternMods, + maxTestTargetForDuplicateNames, + maxTestTargetForPatternMods, + minTestTargetForFlagV, }; diff --git a/spec/helpers/matchers.js b/spec/helpers/matchers.js index 215cc81..bbb9bb3 100644 --- a/spec/helpers/matchers.js +++ b/spec/helpers/matchers.js @@ -6,11 +6,12 @@ function getArgs(actual, expected) { pattern: typeof expected === 'string' ? expected : expected.pattern, flags: expected.flags ?? '', maxTarget: expected.maxTarget ?? null, + minTarget: expected.minTarget ?? null, }; const targets = ['ES2018', 'ES2024', 'ESNext']; - const targeted = opts.maxTarget ? - targets.filter(target => EsVersion[target] <= EsVersion[opts.maxTarget]) : - targets; + const targeted = targets. + filter(target => !opts.maxTarget || (EsVersion[target] <= EsVersion[opts.maxTarget])). 
+ filter(target => !opts.minTarget || (EsVersion[target] >= EsVersion[opts.minTarget])); return { pattern: opts.pattern, flags: opts.flags, diff --git a/spec/match-backreference.spec.js b/spec/match-backreference.spec.js index 596899c..f4ef1d1 100644 --- a/spec/match-backreference.spec.js +++ b/spec/match-backreference.spec.js @@ -1,6 +1,6 @@ import {compile} from '../dist/index.mjs'; import {cp, r} from '../src/utils.js'; -import {maxTargetForDuplicateNames} from './helpers/features.js'; +import {maxTestTargetForDuplicateNames} from './helpers/features.js'; import {matchers} from './helpers/matchers.js'; beforeEach(() => { @@ -252,11 +252,11 @@ describe('Backreference', () => { expect('').not.toFindMatch(r`(?(?\k))`); expect('aa').toExactlyMatch({ pattern: r`(?a)\k|(?b\k)`, - maxTarget: maxTargetForDuplicateNames, + maxTarget: maxTestTargetForDuplicateNames, }); expect(['a', 'b', 'ba', 'bb']).not.toFindMatch({ pattern: r`(?a)\k|(?b\k)`, - maxTarget: maxTargetForDuplicateNames, + maxTarget: maxTestTargetForDuplicateNames, }); }); @@ -265,11 +265,11 @@ describe('Backreference', () => { expect('aba').toExactlyMatch(r`(?a)(?b\k)`); expect(['aa', 'bcb']).toExactlyMatch({ pattern: r`(?a)\k|(?b)(?c\k)`, - maxTarget: maxTargetForDuplicateNames, + maxTarget: maxTestTargetForDuplicateNames, }); expect(['a', 'bc', 'bca', 'bcc']).not.toFindMatch({ pattern: r`(?a)\k|(?b)(?c\k)`, - maxTarget: maxTargetForDuplicateNames, + maxTarget: maxTestTargetForDuplicateNames, }); }); @@ -297,7 +297,7 @@ describe('Backreference', () => { expect('aab').toExactlyMatch(r`(?a)\k(?b)`); expect('aa').toExactlyMatch({ pattern: r`(?a)\k|(?b)`, - maxTarget: maxTargetForDuplicateNames, + maxTarget: maxTestTargetForDuplicateNames, }); }); @@ -345,19 +345,19 @@ describe('Backreference', () => { // rather than JS logic where they match the empty string expect(['aa', 'bb']).toExactlyMatch({ pattern: r`(?a)\k|(?b)\k`, - maxTarget: maxTargetForDuplicateNames, + maxTarget: maxTestTargetForDuplicateNames, 
}); expect(['a', 'b', 'ba']).not.toFindMatch({ pattern: r`(?a)\k|(?b)\k`, - maxTarget: maxTargetForDuplicateNames, + maxTarget: maxTestTargetForDuplicateNames, }); expect(['aa', 'bcb', 'bcc']).toExactlyMatch({ pattern: r`(?a)\k|(?b)(?c)\k`, - maxTarget: maxTargetForDuplicateNames, + maxTarget: maxTestTargetForDuplicateNames, }); expect(['a', 'bc', 'bca']).not.toFindMatch({ pattern: r`(?a)\k|(?b)(?c)\k`, - maxTarget: maxTargetForDuplicateNames, + maxTarget: maxTestTargetForDuplicateNames, }); }); diff --git a/spec/match-char-class-intersection.spec.js b/spec/match-char-class-intersection.spec.js index d72e342..9ceedfe 100644 --- a/spec/match-char-class-intersection.spec.js +++ b/spec/match-char-class-intersection.spec.js @@ -6,6 +6,7 @@ beforeEach(() => { }); // TODO: Add me +// TODO: Test that it throws for target ES2018 describe('CharacterClassIntersection', () => { it('should', () => { diff --git a/spec/match-char-class-range.spec.js b/spec/match-char-class-range.spec.js index 01f2d80..ee4f4f6 100644 --- a/spec/match-char-class-range.spec.js +++ b/spec/match-char-class-range.spec.js @@ -1,14 +1,68 @@ +import {compile} from '../dist/index.mjs'; import {r} from '../src/utils.js'; +import {minTestTargetForFlagV} from './helpers/features.js'; import {matchers} from './helpers/matchers.js'; beforeEach(() => { jasmine.addMatchers(matchers); }); -// TODO: Add me - describe('CharacterClassRange', () => { - it('should', () => { - expect('').toExactlyMatch(r``); + it('should match any char from range', () => { + expect(['a', 'b', 'c']).toExactlyMatch(r`[a-c]`); + expect('d').not.toFindMatch(r`[a-c]`); + }); + + it('should match unescaped hyphen as literal at start of class', () => { + expect('-').toExactlyMatch(r`[-a]`); + expect('-').toExactlyMatch(r`[-\w]`); + expect('-').not.toFindMatch(r`[^-a]`); + expect('-').toExactlyMatch(r`[^[^-a]]`); + expect('-').toExactlyMatch(r`[a[-b]]`); + expect('-').toExactlyMatch(r`[-[ab]]`); + }); + + it('should match unescaped hyphen as literal at end of class', () => { +
+ expect('-').toExactlyMatch(r`[a-]`); + expect('-').toExactlyMatch(r`[\w-]`); + expect('-').toExactlyMatch(r`[a[b-]]`); + expect('-').toExactlyMatch(r`[a[bc]-]`); + }); + + it('should match unescaped hyphen as literal at intersection boundary', () => { + expect('-').toExactlyMatch({ + pattern: r`[a-&&\p{Any}]`, + minTarget: minTestTargetForFlagV, + }); + expect('-').toExactlyMatch({ + pattern: r`[\w-&&\p{Any}]`, + minTarget: minTestTargetForFlagV, + }); + expect('-').toExactlyMatch({ + pattern: r`[\p{Any}&&-a]`, + minTarget: minTestTargetForFlagV, + }); + expect('-').toExactlyMatch({ + pattern: r`[\p{Any}&&-\w]`, + minTarget: minTestTargetForFlagV, + }); + }); + + it('should match unescaped hyphen as literal at right of range', () => { + expect('-').toExactlyMatch(r`[a-z-0]`); + expect('-').toExactlyMatch(r`[a-z-\w]`); + expect('-').toExactlyMatch(r`[a-z-0-9]`); + }); + + it('should throw for reversed ranges', () => { + expect(() => compile(r`[z-a]`)).toThrow(); + expect(() => compile(r`[\u{1}-\0]`)).toThrow(); + }); + + it('should throw for range with set', () => { + expect(() => compile(r`[a-\w]`)).toThrow(); + expect(() => compile(r`[\w-a]`)).toThrow(); + expect(() => compile(r`[\w-a-z]`)).toThrow(); + expect(() => compile(r`[\w-\s]`)).toThrow(); }); }); diff --git a/spec/match-char-class.spec.js b/spec/match-char-class.spec.js index 6a8e709..95d237a 100644 --- a/spec/match-char-class.spec.js +++ b/spec/match-char-class.spec.js @@ -66,4 +66,5 @@ describe('CharacterClass', () => { }); // TODO: Add remaining + // TODO: Test that nested negated classes throw for target ES2018 }); diff --git a/spec/match-char-set.spec.js b/spec/match-char-set.spec.js index 4474725..18a21a6 100644 --- a/spec/match-char-set.spec.js +++ b/spec/match-char-set.spec.js @@ -1,5 +1,5 @@ import {r} from '../src/utils.js'; -import {maxTargetForPatternMods} from './helpers/features.js'; +import {maxTestTargetForPatternMods} from
'./helpers/features.js'; import {matchers} from './helpers/matchers.js'; beforeEach(() => { @@ -17,7 +17,7 @@ describe('CharacterSet', () => { it('should match line feed with flag m disabled', () => { expect('\n').toExactlyMatch({ pattern: r`(?-m)\O`, - maxTarget: maxTargetForPatternMods, + maxTarget: maxTestTargetForPatternMods, }); }); diff --git a/src/parse.js b/src/parse.js index 3ac34c0..4ceae6a 100644 --- a/src/parse.js +++ b/src/parse.js @@ -220,6 +220,7 @@ function parseCharacterClassHyphen(context, state) { if ( prevSiblingNode && prevSiblingNode.type !== AstTypes.CharacterClass && + prevSiblingNode.type !== AstTypes.CharacterClassRange && nextToken && nextToken.type !== TokenTypes.CharacterClassOpen && nextToken.type !== TokenTypes.CharacterClassClose && @@ -558,7 +559,8 @@ function createPattern() { } function createQuantifier(element, min, max, greedy, possessive) { - // TODO: Move validation to tokenizer? + // Could be checked in the tokenizer, but done here to parallel char class range validation and + // to prevent manually creating invalid quantifiers if (max < min) { throw new Error('Quantifier range out of order'); } diff --git a/src/tokenize.js b/src/tokenize.js index eb6dcd7..319c3bb 100644 --- a/src/tokenize.js +++ b/src/tokenize.js @@ -476,7 +476,7 @@ function createTokenForSharedEscape(raw, {inCharClass}) { }).decode(new Uint8Array(bytes)); const encoder = new TextEncoder(); const tokens = [...decoded].map(char => { - // Might have different casing for hex A-F than the input + // Since this regenerates `raw`, it might have different casing for hex A-F than the input const raw = [...encoder.encode(char)].map(byte => `\\x${byte.toString(16)}`).join(''); return createToken(TokenTypes.Character, raw, { value: char.codePointAt(0),