From 2a57623cb01f37df38c770e87d96dd4a3ed0f3a1 Mon Sep 17 00:00:00 2001 From: Steven Levithan Date: Sat, 21 Dec 2024 23:52:59 +0100 Subject: [PATCH] Don't restrict Unicode properties based on target (closes #10) --- README.md | 29 ++++++++++++++--------------- src/generate.js | 6 +----- src/unicode.js | 21 --------------------- 3 files changed, 15 insertions(+), 41 deletions(-) diff --git a/README.md b/README.md index eee89fd..8a298c9 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Oniguruma-To-ES (鬼車➜ES) +# Oniguruma-To-ES (鬼車➡️ES) [![npm version][npm-version-src]][npm-version-href] [![npm downloads][npm-downloads-src]][npm-downloads-href] @@ -230,7 +230,7 @@ JavaScript version used for generated regexes. Using `auto` detects the best val More details - `ES2018`: Uses JS flag `u`. - - Emulation restrictions: Character class intersection, nested negated character classes, and Unicode properties added after ES2018 are not allowed. + - Emulation restrictions: Character class intersection and nested negated character classes are not allowed. - Generated regexes might use ES2018 features that require Node.js 10 or a browser version released during 2018 to 2023 (in Safari's case). Minimum requirement for any regex is Node.js 6 or a 2016-era browser. - `ES2024`: Uses JS flag `v`. - No emulation restrictions. @@ -515,7 +515,7 @@ Notice that nearly every feature below has at least subtle differences from Java \p{L},
\P{L} - ✅[1] + ✅ ✅ ✔ Binary properties
@@ -528,7 +528,7 @@ Notice that nearly every feature below has at least subtle differences from Java ✔ \p, \P without { is an identity escape
✔ Error for key prefixes
✔ Error for props of strings
- ❌ Blocks (wontfix[2])
+ ❌ Blocks (wontfix[1])
@@ -590,7 +590,7 @@ Notice that nearly every feature below has at least subtle differences from Java [[:word:]],
[[:^word:]] - ☑️[3] + ☑️[2] ✅ ✔ All use Unicode definitions
@@ -599,7 +599,7 @@ Notice that nearly every feature below has at least subtle differences from Java Nested class […[…]] - ☑️[4] + ☑️[3] ✅ ✔ Same as JS with flag v
@@ -800,7 +800,7 @@ Notice that nearly every feature below has at least subtle differences from Java ☑️ ☑️ - ✔ Error if group to the right[5]
+ ✔ Error if group to the right[4]
✔ Duplicate names (and subroutines) to the right not included in multiplex
✔ Fail to match (or don't include in multiplex) ancestor groups and groups in preceding alternation paths
❌ Some rare cases are indeterminable at compile time and use the JS behavior of matching an empty string
@@ -854,7 +854,7 @@ Notice that nearly every feature below has at least subtle differences from Java ☑️ ☑️ - ● Has depth limit[6]
+ ● Has depth limit[5]
@@ -867,7 +867,7 @@ Notice that nearly every feature below has at least subtle differences from Java ☑️ ☑️ - ● Has depth limit[6]
+ ● Has depth limit[5]
@@ -935,12 +935,11 @@ The table above doesn't include all aspects that Oniguruma-To-ES emulates (inclu ### Footnotes -1. Target `ES2018` doesn't allow using Unicode property names added in JavaScript specifications after ES2018. -2. Unicode blocks (which in Oniguruma are used with an `In…` prefix) are easily emulatable but their character data would significantly increase library weight. They're also a flawed and arguably unuseful feature, given the ability to use Unicode scripts and other properties. -3. With target `ES2018`, the specific POSIX classes `[:graph:]` and `[:print:]` use ASCII-based versions rather than the Unicode versions available for target `ES2024` and later, and they result in an error if using strict `accuracy`. -4. Target `ES2018` doesn't support nested *negated* character classes. -5. It's not an error for *numbered* backreferences to come before their referenced group in Oniguruma, but an error is the best path for Oniguruma-To-ES because (1) most placements are mistakes and can never match (based on the Oniguruma behavior for backreferences to nonparticipating groups), (2) erroring matches the behavior of named backreferences, and (3) the edge cases where they're matchable rely on rules for backreference resetting within quantified groups that are different in JavaScript and aren't emulatable. Note that it's not a backreference in the first place if using `\10` or higher and not as many capturing groups are defined to the left (it's an octal or identity escape). -6. The recursion depth limit is specified by option `maxRecursionDepth`. Overlapping recursions and the use of backreferences when the recursed subpattern contains captures aren't yet supported. Patterns that would error in Oniguruma due to triggering infinite recursion might find a match in Oniguruma-To-ES since recursion is bounded (future versions will detect this and error at transpilation time). +1. 
Unicode blocks (which in Oniguruma are used with an `In…` prefix) are easily emulatable but their character data would significantly increase library weight. They're also a flawed and arguably not very useful feature, given the ability to use Unicode scripts and other properties. +2. With target `ES2018`, the specific POSIX classes `[:graph:]` and `[:print:]` use ASCII-based versions rather than the Unicode versions available for target `ES2024` and later, and they result in an error if using strict `accuracy`. +3. Target `ES2018` doesn't support nested *negated* character classes. +4. It's not an error for *numbered* backreferences to come before their referenced group in Oniguruma, but an error is the best path for Oniguruma-To-ES because (1) most placements are mistakes and can never match (based on the Oniguruma behavior for backreferences to nonparticipating groups), (2) erroring matches the behavior of named backreferences, and (3) the edge cases where they're matchable rely on rules for backreference resetting within quantified groups that are different in JavaScript and aren't emulatable. Note that it's not a backreference in the first place if using `\10` or higher and not as many capturing groups are defined to the left (it's an octal or identity escape). +5. The recursion depth limit is specified by option `maxRecursionDepth`. Overlapping recursions and the use of backreferences when the recursed subpattern contains captures aren't yet supported. Patterns that would error in Oniguruma due to triggering infinite recursion might find a match in Oniguruma-To-ES since recursion is bounded (future versions will detect this and error at transpilation time). 
## ❌ Unsupported features diff --git a/src/generate.js b/src/generate.js index 20fdd42..86c4af9 100644 --- a/src/generate.js +++ b/src/generate.js @@ -1,7 +1,7 @@ import {getOptions} from './options.js'; import {AstAssertionKinds, AstCharacterSetKinds, AstTypes} from './parse.js'; import {traverse} from './traverse.js'; -import {getIgnoreCaseMatchChars, JsUnicodePropertiesPostEs2018, UnicodePropertiesWithSpecificCase} from './unicode.js'; +import {getIgnoreCaseMatchChars, UnicodePropertiesWithSpecificCase} from './unicode.js'; import {cp, getNewCurrentFlags, isMinTarget, r} from './utils.js'; import {isLookaround} from './utils-node.js'; @@ -72,7 +72,6 @@ function generate(ast, options) { useDuplicateNames: minTargetEs2025, useFlagMods: minTargetEs2025, useFlagV: minTargetEs2024, - usePostEs2018Properties: minTargetEs2024, verbose: opts.verbose, }; function gen(node) { @@ -349,9 +348,6 @@ function genCharacterSet({kind, negate, value, key}, state) { return negate ? r`\D` : r`\d`; } if (kind === AstCharacterSetKinds.property) { - if (!state.usePostEs2018Properties && JsUnicodePropertiesPostEs2018.has(value)) { - throw new Error(`Unicode property "${value}" unavailable in target ES2018`); - } if ( state.useAppliedIgnoreCase && state.currentFlags.ignoreCase && diff --git a/src/unicode.js b/src/unicode.js index 410adc5..c6fd2b6 100644 --- a/src/unicode.js +++ b/src/unicode.js @@ -159,26 +159,6 @@ for (const p of JsUnicodePropertiesOfStrings) { JsUnicodePropertiesOfStringsMap.set(slug(p), p); } -// Unicode scripts and binary properties (and their aliases) added after ES2018 -// See -const JsUnicodePropertiesPostEs2018 = new Set(( - // ES2019 scripts - 'Dogr Dogra Gong Gunjala_Gondi Hanifi_Rohingya Maka Makasar Medefaidrin Medf Old_Sogdian Rohg Sogd Sogdian Sogo' + - // ES2019 binary properties - ' Extended_Pictographic' + - // ES2020 scripts - ' Elym Elymaic Hmnp Nand Nandinagari Nyiakeng_Puachue_Hmong Wancho Wcho' + - // ES2021 scripts - ' Chorasmian Chrs Diak 
Dives_Akuru Khitan_Small_Script Kits Yezi Yezidi' + - // ES2021 binary properties - ' EBase EComp EMod EPres ExtPict' + - // ES2022 scripts - ' Cpmn Cypro_Minoan Old_Uyghur Ougr Tangsa Tnsa Toto Vith Vithkuqi' + - // ES2023 scripts - ' Gara Garay Gukh Gurung_Khema Hrkt Katakana_Or_Hiragana Kawi Kirat_Rai Krai Nag_Mundari Nagm Ol_Onal Onao Sunu Sunuwar Todhri Todr Tulu_Tigalari Tutg Unknown Zzzz' - // ES2024: None, but added `JsUnicodePropertiesOfStrings` -).split(' ')); - const LowerToAlternativeLowerCaseMap = new Map([ ['s', cp(0x17F)], // s, ſ [cp(0x17F), 's'], // ſ, s @@ -291,7 +271,6 @@ export { JsUnicodeProperties, JsUnicodePropertiesMap, JsUnicodePropertiesOfStringsMap, - JsUnicodePropertiesPostEs2018, PosixClassesMap, PosixProperties, slug,