Skip to content

Commit

Permalink
Multibyte escapes
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Nov 3, 2024
1 parent 696dcab commit b6da11e
Show file tree
Hide file tree
Showing 6 changed files with 62 additions and 20 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,8 @@ Notice that nearly every feature below has at least subtle differences from Java
<td align="middle">✅</td>
<td>
✔ Allows 1 hex digit<br>
✔ Error for 2 hex digits > <code>7F</code><br>
✔ Above <code>7F</code>, is treated as a UTF-8 encoded byte (unlike JS)<br>
✔ Error for invalid encoded bytes<br>
</td>
</tr>
<tr valign="top">
Expand Down
2 changes: 2 additions & 0 deletions scripts/onig-compare.js
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ compare([
[r`\x{13FFFF}`, ``, r`Beyond Unicode range: JS doesn't support`],
[r`\x{140000}`, ``],
[r`\x{0 1}`, `\u{0}\u{1}`],
[r`\💖`, '💖'],
[`\\\u{10000}`, '\u{10000}'],
]);

async function compare(tests) {
Expand Down
1 change: 1 addition & 0 deletions spec/match-char-class.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ beforeEach(() => {
});

describe('CharacterClass', () => {
// TODO: Move to `match-char.spec.js`?
describe('Character', () => {
describe('escape', () => {
it('should match supported letter escapes', () => {
Expand Down
33 changes: 21 additions & 12 deletions spec/match-char.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ describe('Character', () => {
});

describe('identity escape', () => {
it('should match identity escapes', () => {
it('should match BMP identity escapes', () => {
const baseNonmetachars = [
'\0', '!', '~', ' ', '\n', 'E', 'm', '£', '\uFFFF',
];
Expand All @@ -84,12 +84,12 @@ describe('Character', () => {
}
});

it('should throw for multibyte escapes', () => {
const multibyte = [
it('should match astral identity escapes', () => {
const astral = [
'💖', '\u{10000}', '\u{10FFFF}',
];
for (const char of multibyte) {
expect(() => compile(`\\${char}`)).toThrow();
for (const char of astral) {
expect(char).toExactlyMatch(`\\${char}`);
}
});
});
Expand Down Expand Up @@ -160,23 +160,32 @@ describe('Character', () => {
expect('\u{A}').toExactlyMatch(r`\xa`);
});

it(r`should match hex char code with \xNN`, () => {
it(r`should match hex char code with \xNN up to 7F`, () => {
expect('\u{1}').toExactlyMatch(r`\x01`);
expect('\u{1}1').toExactlyMatch(r`\x011`);
expect('\u{A}').toExactlyMatch(r`\x0A`);
expect('\u{A}').toExactlyMatch(r`\x0a`);
expect('\u{7F}').toExactlyMatch(r`\x7F`);
});

it(r`should throw for incomplete \x`, () => {
expect(() => compile(r`\x`)).toThrow();
expect(() => compile(r`\x.`)).toThrow();
expect(() => compile(r`[\x]`)).toThrow();
it(r`should match hex char code UTF-8 encoded byte sequences \xNN (above 7F)`, () => {
// Each `\xNN` above 7F is one byte of a UTF-8 sequence; a run of consecutive
// escapes is decoded together into the code point(s) it encodes
expect('\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC`); // €
expect('\u{20AC}\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC\xE2\x82\xAC`); // €€
// `\x7F` is not an encoded byte, so it separates two independent byte runs
expect('\u{20AC}\u{7F}\u{20AC}').toExactlyMatch(r`\xE2\x82\xAC\x7F\xE2\x82\xAC`); // €, DEL (U+007F), €
expect('\u{9A69}').toExactlyMatch(r`\xE9\xA9\xA9`); // 驩
expect('\u{FEFF}').toExactlyMatch(r`\xEF\xBB\xBF`); // ZWNBSP/BOM
});

it(r`should throw for multibyte \xNN (above 7F)`, () => {
expect(() => compile(r`\x7F`)).not.toThrow();
it(r`should throw for invalid UTF-8 encoded byte sequences \xNN (above 7F)`, () => {
// 0x80 is a continuation byte with no preceding lead byte
expect(() => compile(r`\x80`)).toThrow();
// 0xFF never appears in well-formed UTF-8
expect(() => compile(r`\xFF`)).toThrow();
// 0xEF is the lead byte of a three-byte sequence, but 0xC0 is not a continuation byte
expect(() => compile(r`\xEF\xC0\xBB`)).toThrow();
});

it(r`should throw for incomplete \x`, () => {
expect(() => compile(r`\x`)).toThrow();
expect(() => compile(r`\x.`)).toThrow();
expect(() => compile(r`[\x]`)).toThrow();
});

it(r`should match hex char code with \uNNNN`, () => {
Expand Down
1 change: 1 addition & 0 deletions src/generate.js
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ const CharCodeEscapeMap = new Map([
[13, r`\r`], // carriage return
[0x2028, r`\u2028`], // line separator
[0x2029, r`\u2029`], // paragraph separator
[0xFEFF, r`\uFEFF`], // ZWNBSP/BOM
]);

const casedRe = /^\p{Cased}$/u;
Expand Down
42 changes: 35 additions & 7 deletions src/tokenize.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ const EscapeCharCodes = new Map([
const controlCharPattern = 'c.? | C(?:-.?)?';
// Onig considers `\p` an identity escape, but e.g. `\p{`, `\p{ ^L}`, and `\p{gc=L}` are invalid
const unicodePropertyPattern = r`[pP]\{(?:\^?[\x20\w]+\})?`;
const encodedByteValuePattern = r`x[89A-Fa-f]\p{AHex}(?:\\x[89A-Fa-f]\p{AHex})*`;
const hexCharPattern = r`u(?:\p{AHex}{4})? | x\{[^\}]*\}? | x\p{AHex}{0,2}`;
const escapedNumPattern = r`\d{1,3}`;
const charClassOpenPattern = r`\[\^?\]?`;
Expand All @@ -71,6 +72,7 @@ const tokenRe = new RegExp(r`
\\ (?:
${controlCharPattern}
| ${unicodePropertyPattern}
| ${encodedByteValuePattern}
| ${hexCharPattern}
| ${escapedNumPattern}
| [gk]<[^>]*>
Expand All @@ -93,6 +95,7 @@ const charClassTokenRe = new RegExp(r`
\\ (?:
${controlCharPattern}
| ${unicodePropertyPattern}
| ${encodedByteValuePattern}
| ${hexCharPattern}
| ${escapedNumPattern}
| .
Expand Down Expand Up @@ -252,9 +255,8 @@ function getTokenWithDetails(context, pattern, m, lastIndex) {
};
}
// Run last since it assumes an identity escape as final condition
return {
token: createTokenForSharedEscape(m, {inCharClass: false}),
};
const result = createTokenForSharedEscape(m, {inCharClass: false});
return Array.isArray(result) ? {tokens: result} : {token: result};
}
if (m0 === '(') {
// Comment group
Expand Down Expand Up @@ -405,7 +407,12 @@ function getAllTokensForCharClass(pattern, opener, lastIndex) {
break;
}
} else {
tokens.push(createTokenForAnyTokenWithinCharClass(m));
const result = createTokenForAnyTokenWithinCharClass(m);
if (Array.isArray(result)) {
tokens.push(...result);
} else {
tokens.push(result);
}
}
}
return {
Expand Down Expand Up @@ -459,6 +466,27 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
}
return createTokenForUnicodeProperty(raw);
}
// UTF-8 encoded byte sequence
if (/^\\x[89A-Fa-f]\p{AHex}/u.test(raw)) {
try {
const bytes = raw.split(/\\x/).slice(1).map(hex => parseInt(hex, 16));
const decoded = new TextDecoder('utf-8', {
ignoreBOM: true,
fatal: true,
}).decode(new Uint8Array(bytes));
const encoder = new TextEncoder();
const tokens = [...decoded].map(char => {
// Might have different casing for hex A-F than the input
const raw = [...encoder.encode(char)].map(byte => `\\x${byte.toString(16)}`).join('');
return createToken(TokenTypes.Character, raw, {
value: char.codePointAt(0),
});
});
return tokens;
} catch (err) {
throw new Error(`Too short or invalid multibyte code "${raw}"`);
}
}
if (char1 === 'u' || char1 === 'x') {
return createToken(TokenTypes.Character, raw, {
value: getValidatedHexCharCode(raw),
Expand All @@ -484,13 +512,13 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
// [TODO] Supportable; see <https://github.com/kkos/oniguruma/blob/master/doc/SYNTAX.md#12-onig_syn_op2_esc_capital_m_bar_meta-enable-m-x>, <https://github.com/kkos/oniguruma/blob/43a8c3f3daf263091f3a74019d4b32ebb6417093/src/regparse.c#L4695>
throw new Error(`Unsupported meta "${raw}"`);
}
// Identity escape; count code unit length
if (raw.length === 2) {
// Identity escape; count code point length
if ([...raw].length === 2) {
return createToken(TokenTypes.Character, raw, {
value: raw.codePointAt(1),
});
}
throw new Error(`Invalid multibyte escape "${raw}"`);
throw new Error(`Unexpected escape "${raw}"`);
}

/**
Expand Down

0 comments on commit b6da11e

Please sign in to comment.