Skip to content

Commit

Permalink
Test character tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Oct 28, 2024
1 parent 8ea1239 commit c2c13f3
Show file tree
Hide file tree
Showing 5 changed files with 237 additions and 11 deletions.
2 changes: 1 addition & 1 deletion dist/index.min.js

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions spec/match-character-class.spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import {r} from '../src/utils.js';
import {matchers} from './helpers/matchers.js';

// Register the custom matchers imported from './helpers/matchers.js' before every spec in this file
beforeEach(() => {
jasmine.addMatchers(matchers);
});

describe('CharacterClass', () => {
  describe('Character', () => {
    describe('escape', () => {
      it('should match supported letter escapes', () => {
        // Each entry pairs the expected subject char with a char class holding its letter escape
        const letterEscapes = [
          ['\x07', r`[\a]`],
          ['\x08', r`[\b]`],
          ['\x1B', r`[\e]`],
          ['\f', r`[\f]`],
          ['\n', r`[\n]`],
          ['\r', r`[\r]`],
          ['\t', r`[\t]`],
          ['\v', r`[\v]`],
        ];
        for (const [subject, pattern] of letterEscapes) {
          expect(subject).toMatchWithAllTargets(pattern);
        }
      });
    });
    // TODO: Rest
  });
  // TODO: Rest
});
1 change: 1 addition & 0 deletions spec/match-character-set.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ describe('CharacterSet', () => {
expect('\n').toMatchWithAllTargets({pattern: '.', flags: 'm'});
});
});
// TODO: Rest
});
202 changes: 202 additions & 0 deletions spec/match-character.spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
import {compile} from '../dist/index.mjs';
import {r} from '../src/utils.js';
import {matchers} from './helpers/matchers.js';

// Register the custom matchers imported from './helpers/matchers.js' before every spec in this file
beforeEach(() => {
jasmine.addMatchers(matchers);
});

describe('Character', () => {
describe('literal', () => {
  it('should match literal chars', () => {
    // A purely literal pattern is expected to match the identical subject string
    for (const text of ['a', 'Multiple chars!']) {
      expect(text).toMatchWithAllTargets(text);
    }
  });
});

describe('control', () => {
  it(r`should match control char with \cx`, () => {
    // Upper- and lowercase letters after `\c` are expected to give the same control char
    const cases = [
      ['\x01', r`\cA`],
      ['\x01', r`\ca`],
      ['\x1A', r`\cZ`],
      ['\x1A', r`\cz`],
    ];
    cases.forEach(([subject, pattern]) => {
      expect(subject).toMatchWithAllTargets(pattern);
    });
  });

  it(r`should match control char with \C-x`, () => {
    // Same expectations as `\cx`, using the alternative `\C-x` spelling
    const cases = [
      ['\x01', r`\C-A`],
      ['\x01', r`\C-a`],
      ['\x1A', r`\C-Z`],
      ['\x1A', r`\C-z`],
    ];
    cases.forEach(([subject, pattern]) => {
      expect(subject).toMatchWithAllTargets(pattern);
    });
  });

  // Currently unsupported: control chars other than A-Za-z
  it('should throw for unsupported control char', () => {
    for (const pattern of [r`\c.`, r`\C-.`]) {
      expect(() => compile(pattern)).toThrow();
    }
  });

  it(r`should throw for incomplete \c`, () => {
    const incomplete = r`\c`;
    expect(() => compile(incomplete)).toThrow();
  });

  it(r`should throw for incomplete \C`, () => {
    for (const pattern of [r`\C`, r`\C-`]) {
      expect(() => compile(pattern)).toThrow();
    }
  });
});

describe('escape', () => {
  it('should match supported letter escapes', () => {
    // [expected subject char, letter escape pattern]
    // `\b` supported in char class only
    const letterEscapes = [
      ['\x07', r`\a`],
      ['\x1B', r`\e`],
      ['\f', r`\f`],
      ['\n', r`\n`],
      ['\r', r`\r`],
      ['\t', r`\t`],
      ['\v', r`\v`],
    ];
    for (const [subject, pattern] of letterEscapes) {
      expect(subject).toMatchWithAllTargets(pattern);
    }
  });
});

describe('escaped metacharacter', () => {
  it('should match escaped metacharacters', () => {
    // Each metachar, when preceded by a backslash, is expected to match itself literally
    const metachars = [
      '$', '(', ')', '*', '+', '.', '?', '[', '\\', ']', '^', '{', '|', '}',
    ];
    metachars.forEach((metachar) => {
      const escaped = `\\${metachar}`;
      expect(metachar).toMatchWithAllTargets(escaped);
    });
  });

  it(`should throw for incomplete \\`, () => {
    // A pattern consisting of a lone backslash has nothing to escape
    const loneBackslash = `\\`;
    expect(() => compile(loneBackslash)).toThrow();
  });
});

describe('identity escape', () => {
  it('should match identity escapes', () => {
    // Chars with no special escape meaning; all are a single UTF-16 code unit
    const unspecialChars = [
      '\0', '!', '~', ' ', '\n', 'E', 'm', '£', '\uFFFF',
    ];
    unspecialChars.forEach((unspecial) => {
      const escaped = `\\${unspecial}`;
      expect(unspecial).toMatchWithAllTargets(escaped);
    });
  });

  it('should throw for multibyte escapes', () => {
    // Astral code points (two UTF-16 code units each)
    const astralChars = [
      '💖', '\u{10000}', '\u{10FFFF}',
    ];
    astralChars.forEach((astral) => {
      const escaped = `\\${astral}`;
      expect(() => compile(escaped)).toThrow();
    });
  });
});

describe('meta-code', () => {
  it('should throw for unsupported meta-code', () => {
    const patterns = [
      r`\M`,
      r`\M-`,
      // Currently unsupported
      r`\M-\1`,
    ];
    for (const pattern of patterns) {
      expect(() => compile(pattern)).toThrow();
    }
  });

  it('should throw for unsupported meta control char', () => {
    const patterns = [
      r`\M-\C`,
      r`\M-\C-`,
      // Currently unsupported
      r`\M-\C-A`,
    ];
    for (const pattern of patterns) {
      expect(() => compile(pattern)).toThrow();
    }
  });
});

describe('escaped number', () => {
  // Runs one match expectation per [subject, pattern] pair, in order
  const expectAllToMatch = (cases) => {
    for (const [subject, pattern] of cases) {
      expect(subject).toMatchWithAllTargets(pattern);
    }
  };

  it('should match null', () => {
    expectAllToMatch([
      ['\0', r`\0`],
      ['\0', r`\00`],
      ['\0', r`\000`],
    ]);
  });

  it('should match null followed by literal digits', () => {
    // Only the first three digits are expected to form the escape; the fourth is a literal
    expectAllToMatch([
      ['\u{0}0', r`\0000`],
      ['\u{0}1', r`\0001`],
    ]);
  });

  it('should throw for invalid backrefs', () => {
    const digits = Array.from({length: 9}, (_, idx) => idx + 1);
    digits.forEach((digit) => {
      // Escaped single digit 1-9 is always treated as a backref
      expect(() => compile(`\\${digit}`)).toThrow();
    });
  });

  it('should match octals', () => {
    expectAllToMatch([
      ['\u{1}', r`\01`],
      ['\u{1}', r`\001`],
      [String.fromCodePoint(0o17), r`\17`],
      [String.fromCodePoint(0o777), r`\777`],
    ]);
  });

  it('should match octals followed by literal digits', () => {
    const octal100 = String.fromCodePoint(0o100);
    const octal11 = String.fromCodePoint(0o11);
    // Digits 8 and 9 cannot be part of an octal, so they end the escape and match literally
    expectAllToMatch([
      ['\u{0}1', r`\0001`],
      [`${octal100}0`, r`\1000`],
      ['\u{1}8', r`\18`],
      ['\u{1}9', r`\19`],
      ['\u{1}90', r`\190`],
      [`${octal11}8`, r`\118`],
      [`${octal11}9`, r`\119`],
      [`${octal11}90`, r`\1190`],
    ]);
  });

  it('should match identity escape followed by literal digits', () => {
    expectAllToMatch([
      ['80', r`\80`],
      ['90', r`\90`],
      ['900', r`\900`],
    ]);
  });
});

// Specs for code-point escapes: `\xN`, `\xNN`, `\uNNNN`, and `\u{N...}`
describe('unicode', () => {
  it(r`should match hex char code with \xN`, () => {
    expect('\u{1}').toMatchWithAllTargets(r`\x1`);
  });

  it(r`should match hex char code with \xNN`, () => {
    expect('\u{1}').toMatchWithAllTargets(r`\x01`);
    // At most two hex digits belong to `\x`; the trailing `1` is a literal
    expect('\u{1}1').toMatchWithAllTargets(r`\x011`);
  });

  it(r`should throw for incomplete \x`, () => {
    expect(() => compile(r`\x`)).toThrow();
    expect(() => compile(r`\xG0`)).toThrow();
  });

  it(r`should match hex char code with \uNNNN`, () => {
    // Exercise the `\uNNNN` form itself; this spec previously repeated the `\x01` pattern, so
    // the four-digit form had no positive test
    expect('\u0001').toMatchWithAllTargets(r`\u0001`);
  });

  it(r`should throw for incomplete \u`, () => {
    expect(() => compile(r`\u`)).toThrow();
    expect(() => compile(r`\uG000`)).toThrow();
    // Fewer than four hex digits
    expect(() => compile(r`\u0`)).toThrow();
    expect(() => compile(r`\u00`)).toThrow();
    expect(() => compile(r`\u000`)).toThrow();
  });

  it(r`should match hex char code with \u{N...}`, () => {
    expect('\u{1}').toMatchWithAllTargets(r`\u{1}`);
    // Whitespace padding inside the braces is expected to be accepted
    expect('\u{1}').toMatchWithAllTargets(r`\u{ 1}`);
    expect('\u{1}').toMatchWithAllTargets(r`\u{1 }`);
    expect('\u{1}').toMatchWithAllTargets(r`\u{ 1 }`);
    // Leading zeros, up to six hex digits in total
    expect('\u{1}').toMatchWithAllTargets(r`\u{01}`);
    expect('\u{1}').toMatchWithAllTargets(r`\u{000001}`);
    // Highest valid Unicode code point
    expect('\u{10FFFF}').toMatchWithAllTargets(r`\u{10FFFF}`);
  });

  it(r`should throw for incomplete or invalid \u{N...}`, () => {
    expect(() => compile(r`\u{`)).toThrow();
    expect(() => compile(r`\u{0`)).toThrow();
    expect(() => compile(r`\u{0 0}`)).toThrow();
    expect(() => compile(r`\u{G}`)).toThrow();
    // Seven hex digits
    expect(() => compile(r`\u{0000001}`)).toThrow();
    // Above the maximum code point U+10FFFF
    expect(() => compile(r`\u{110000}`)).toThrow();
  });
});
});
18 changes: 8 additions & 10 deletions src/tokenize.js
Original file line number Diff line number Diff line change
Expand Up @@ -454,19 +454,18 @@ function createTokenForSharedEscape(raw, {inCharClass}) {
if (raw === '\\') {
throw new Error(r`Incomplete escape "\"`);
}
// Meta `\M-x` and meta control char `\M-\C-x` are unsupported; avoid treating as an identity escape
// Meta-code `\M-x` and `\M-\C-x` are unsupported; avoid treating as an identity escape
if (char1 === 'M') {
// [TODO] This can be supported fairly easily
throw new Error(`Unsupported meta escape "${raw}"`);
// [TODO] Supportable; see <https://github.com/kkos/oniguruma/blob/master/doc/SYNTAX.md#12-onig_syn_op2_esc_capital_m_bar_meta-enable-m-x>, <https://github.com/kkos/oniguruma/blob/43a8c3f3daf263091f3a74019d4b32ebb6417093/src/regparse.c#L4695>
throw new Error(`Unsupported meta-code "${raw}"`);
}
// Identity escape
// TODO: Should this count code point length instead?
// Identity escape; count code unit length
if (raw.length === 2) {
return createToken(TokenTypes.Character, raw, {
value: raw.codePointAt(1),
});
}
throw new Error(`Unexpected escape "${raw}"`);
throw new Error(`Invalid multibyte escape "${raw}"`);
}

/**
Expand All @@ -486,10 +485,9 @@ function createToken(type, raw, data) {
// Expects `\cx` or `\C-x`
function createTokenForControlChar(raw) {
const char = raw[1] === 'c' ? raw[2] : raw[3];
if (!char || !/[a-zA-Z]/.test(char)) {
// Unlike JS, Onig allows any char to follow `\c` (with special conversion rules), but this is
// an extreme edge case
// [TODO] This can be supported fairly easily
if (!char || !/[A-Za-z]/.test(char)) {
// Unlike JS, Onig allows any char to follow `\c` or `\C-`, but this is an extreme edge case
// [TODO] Supportable; see <https://github.com/kkos/oniguruma/blob/master/doc/SYNTAX.md#11-onig_syn_op2_esc_capital_c_bar_control-enable-c-x>, <https://github.com/kkos/oniguruma/blob/43a8c3f3daf263091f3a74019d4b32ebb6417093/src/regparse.c#L4695>
throw new Error(`Unsupported control character "${raw}"`);
}
return createToken(TokenTypes.Character, raw, {
Expand Down

0 comments on commit c2c13f3

Please sign in to comment.